This code was used for the analyses of Cariaco Basin metagenomes and metatranscriptomes for the 2023 publication “Diverse secondary metabolites are expressed in particle-associated and free-living microorganisms of the permanently anoxic Cariaco Basin” by Geller-McGrath et al. Some sections of these analyses were done on a compute node of the Poseidon High Performance Computing (HPC) cluster based at the Woods Hole Oceanographic Institute (WHOI). All of the analyses done here using R can be run locally; HPC processing was used to speed up computations.


Load required R libraries

library(tidyverse)
library(ComplexHeatmap)
library(readxl)
library(umap)
library(cowplot)
library(dbscan)
library(ggtext) # used for ggplots
library(glue) # used for ggplots
library(cowplot) # for combining various plots together in grids
library(gridtext) # for editing text in heatmap plots
library(ComplexHeatmap) # for heatmap plots
library(pheatmap) # for heatmap plots
library(janitor)
library(ggfortify)
library(grid)


Figure 1. Normalized biosynthetic gene cluster count per phylum

# BGC frequency plot by phylum --------------------------------------------
bgc_count_data_summary = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/bgc_count_data_summary.rds')

smc = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/smc.rds')


highlight <- function(x, pat, color = "black", family = "") {
  ifelse(grepl(pat, x), glue("<b style='font-family:{family}; color:{color}'>{x}</b>"), x)
}

face2 = bgc_count_data_summary %>%
  select(phylum, domain, n_mags_in_phylum) %>%
  distinct() %>%
  filter(n_mags_in_phylum == 1) %>%
  mutate(type = 'bold')


bact_final_sm_freq_plot = bgc_count_data_summary %>%
  filter(domain == 'Bacteria') %>%
  ggplot(aes(as_factor(phylum), norm_sm_freq,
             fill = fct_rev(fct_infreq(product)))) +
  geom_bar(stat = 'identity') +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 70, hjust = 1, size = 10),
        axis.text.y = element_text(size = 10),
        axis.title = element_text(size = 12),
        legend.position = c(0.98, 0.98),
        legend.justification = c(0.98, 0.98),
        legend.text = element_text(size = 11),
        legend.title = element_text(size = 12),
        legend.background = element_blank(), panel.border =
          element_rect(color = 'black', fill = NA),
        legend.box.background = element_rect(color = 'black'),
        panel.grid = element_blank(),
        strip.text = element_text(size = 13),
        axis.title.y = element_blank(),
        strip.background = element_rect(fill = 'wheat1')
  ) +
  labs(x = '\nPhylum',
       fill = 'Secondary metabolite class') +
  scale_fill_manual(values = smc) +
  facet_grid(~ domain, space = 'free', scales = 'free') +
  scale_x_discrete(labels = function(x) highlight(x, paste0(face2$phylum, collapse = '|'), 'black')) +
  theme(axis.text.x = element_markdown())# +


arch_final_sm_freq_plot = bgc_count_data_summary %>%
  filter(domain == 'Archaea') %>%
  ggplot(aes(as_factor(phylum), norm_sm_freq,
             fill = fct_rev(fct_infreq(product)))) +
  geom_bar(stat = 'identity') +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 70, hjust = 1),
        axis.text = element_text(size = 10),
        axis.title = element_text(size = 12),
        legend.position = 'none',
        panel.grid = element_blank(),
        strip.text = element_text(size = 13),
        axis.title.y = element_blank(),
        strip.background = element_rect(fill = 'wheat1')
  ) +
  labs(x = '\nPhylum',
       fill = 'Secondary metabolite class') +
  scale_fill_manual(values = smc) +
  facet_grid(~ domain, space = 'free', scales = 'free') +
  scale_x_discrete(labels = function(x) highlight(x, paste0(face2$phylum, collapse = '|'), 'black')) +
  theme(axis.text.x = element_markdown())


# save the sm biosynthetic potential plot
sm_freq_plot = plot_grid(bact_final_sm_freq_plot, NULL, arch_final_sm_freq_plot,
                         labels = c('a', '', 'b'),
                         rel_widths = c(3, 0.02, 1),
                         nrow = 1,
                         align = 'hv'#,
                         #label_x = c(0.0975, 0, 0)
)
sm_freq_plot


Figure 2. Expression of secondary metabolite biosynthetic transcripts from individual MAGs in metatranscriptomic samples

mm2_final = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/logTransformed_tpm_for_bgc_genes_minimap2_res.rds')

metaT_col_order = tibble(
  sample_name = colnames(mm2_final)[2:ncol(mm2_final)],
  fraction = if_else(
    str_detect(sample_name, 'PA'), 'PA', 'FL'),
  depth = str_replace(
    sample_name, '.*(\\d{3}).*', '\\1')
) %>%
  arrange(desc(fraction), depth) %>%
  pull(sample_name)


# create color specifications for column annotations
anno_colors = list(
  Fraction = c(
    'PA' = 'darkgreen',
    'FL' = 'darkseagreen2'),
  Layer = c(
    'Oxycline' = 'darkslategray3', #'wheat2',
    'Shallow anoxic' = 'gold',# 'khaki2',
    'Euxinic' =  'sienna3'),

  Season = c('May' = 'slategray',
             'November' = 'slategray1'),

  `Depth (m)` = c(
    '103' = 'grey90',
    '148' = 'grey85',
    '198' = 'grey80',
    '200' = 'grey75',
    '234' = 'grey70',
    '237' = 'grey65',
    '247' = 'grey60',
    '267' = 'grey55',
    '295' = 'grey50',
    '314' = 'grey45',
    '900' = 'grey20'
  )
)


# final plot used in manuscript
mm2_final_mod =
  mm2_final %>%
  mutate(group_col = str_replace(
    gene_name, '(MAG_\\d+-ctg\\d+)_.*', '\\1'), .before = 1) %>%
  relocate(group_col) %>%
  mutate(rowSums = rowSums(.[3:ncol(.)]), .after = 1) %>%
  mutate(rs_pa = rowSums(.[str_detect(colnames(.), 'PA')]),
         rs_fl = rowSums(.[str_detect(colnames(.), 'FL')])) %>%
  relocate(c(rs_pa, rs_fl), .after = 1) %>%
  filter(
    rowSums > 19
  ) %>%
  group_by(group_col) %>%
  mutate(group_PA_rowSums = sum(rs_pa), .after = 1) %>%
  mutate(group_FL_rowSums = sum(rs_fl), .after = 1) %>%
  mutate(sorting_col = if_else(
    group_PA_rowSums > group_FL_rowSums, 'PA', 'FL'), .after = group_PA_rowSums) %>%
  ungroup() %>%
  arrange(desc(group_PA_rowSums), sorting_col) %>%
  select(-c(rowSums, group_PA_rowSums, group_FL_rowSums,
            rs_pa, rs_fl, group_col, sorting_col)) %>%
  column_to_rownames(var = 'gene_name') %>%
  relocate(all_of(metaT_col_order))


mm2_final_mod =
  mm2_final_mod %>%

  mutate(pa_sums = rowSums(.[str_detect(colnames(.), 'PA')]),
         fl_sums = rowSums(.[str_detect(colnames(.), 'FL')]) ) %>%
  relocate(c(pa_sums, fl_sums), .before = 1) %>%
  mutate(row_sorter = if_else(pa_sums > fl_sums, TRUE, FALSE), .before = 1) %>%
  arrange(desc(row_sorter), desc(pa_sums)) %>%
  select(-c(pa_sums, fl_sums, row_sorter)) %>%

  relocate(colnames(.) %>%
             {function(x) {
               tbl = tibble(
                 mpa = x[str_detect(x, 'MPA')],
                 npa = x[str_detect(x, 'NPA')],
                 mfl = x[str_detect(x, 'MFL')],
                 nfl = x[str_detect(x, 'NFL')]
               ) %>%
                 mutate(groupper = rep(paste0('group_', seq(1, 6, by = 1)), each = 2)) %>%
                 group_by(groupper) %>%
                 summarize(across(everything(), ~
                                    paste0(.x, collapse = ';')),
                           .groups = 'drop') %>%
                 select(-groupper)

               pa = as.vector(rbind(tbl$mpa, tbl$npa))
               fl = as.vector(rbind(tbl$mfl, tbl$nfl))

               g = c(pa, fl)
               g = str_split(g, pattern = ';') %>% unlist()

               return(g)}}())

metaT_row_anno = tibble(
  gene_name = rownames(mm2_final_mod),
  mag_name = str_replace(gene_name, '(MAG_\\d+)-.*', '\\1')
) %>%
  column_to_rownames(var = 'gene_name')


# create column annotation information object
col_anno = tibble(
  sample_name = colnames(mm2_final_mod),
  Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
  Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
  mutate(Depth = as.integer(Depth),
         Layer = case_when(
           Depth < 247 ~ 'Oxycline',
           Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(Depth = as.character(Depth)) %>%
  rename(`Depth (m)` = Depth) %>%
  column_to_rownames(var = 'sample_name')


mm2_final_plot =
  mm2_final_mod %>%
  rownames_to_column(var = 'gene_name') %>%
  mutate(mag_name = str_replace(
    gene_name, '(MAG_\\d+)-.*', '\\1')) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%
  select(-mag_name) %>%
  column_to_rownames(var = 'gene_name') %>%
  as.matrix() %>%

  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    show_colnames = FALSE,
    annotation_col = col_anno,
    #annotation_row = metaT_row_anno,
    annotation_colors = anno_colors,
    gaps_col = 24,
    clustering_method = 'mcquitty',
    #cluster_rows = FALSE,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    border_color = 'grey60',
    treeheight_row = 0,
    treeheight_col = 0,
    color =
      c(#colorRampPalette(colors = 'white')(100),
        colorRampPalette(colors = c(
          'gray98',
          '#aae6fa',
          '#c9e6f0',
          '#f7e96d',
          '#f7e43b',
          'goldenrod1',
          'orange',
          '#ff6a00',
          '#D73027',
          'purple4'))(1000)
      ),
    legend_breaks = seq(0, 6, by = 2),
    legend_labels = gt_render(c(
      '0',
      '6.39 x 10^0',
      '5.36 x 10^1',
      '4.02 x 10^2')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )



mm2_final_plot = ComplexHeatmap::draw(
  mm2_final_plot,
  merge_legend = TRUE,
  heatmap_legend_side = 'right',
  annotation_legend_side = 'right'
)


Figure 3. UMAP analysis for read mapping data of particle-associated and free-living metagenomes (a) and metatranscriptomes (b) to all BGCs longer than 10kb total length detected in Cariaco MAGs

# Metatranscriptome UMAP analysis ----------------------------------------------------
setwd('~/Documents/cariaco-tidy-analysis/')
rnaseq_mapping_counts = readRDS('rdata-files/rnaseq_read_mapping_minimap2_on_cariaco_genes.rds')

bgc_genes_10kb = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/bgc_genes_10kb.rds')

mm2_final = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/logTransformed_tpm_for_bgc_genes_minimap2_res.rds')

col_metadata = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/col_metadata.rds')

rnaseq_umap_tbl = mm2_final |>
  filter(gene_name %in% bgc_genes_10kb$gene_name) |>
  mutate(contig = str_replace(gene_name, '(MAG_\\d+)-ctg(.*)_\\d+', '\\1_\\2'),
         mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1')) |>
  relocate(c(mag_name, contig), .before = 1) |>
  select(-gene_name) |>
  group_by(mag_name, contig) |>
  summarize(across(everything(), ~ sum(.x))) |>
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983'))
## `summarise()` has grouped output by 'mag_name'. You can override using the
## `.groups` argument.
# need the environmental metadata
cariaco_env_sheets = excel_sheets('~/Documents/cariaco-tidy-analysis/stables/Cariaco_Supplementary_files_2022.xlsx')

cariaco_env_vars = read_excel('~/Documents/cariaco-tidy-analysis/stables/Cariaco_Supplementary_files_2022.xlsx',
                              sheet = cariaco_env_sheets[6])

cariaco_env_vars = cariaco_env_vars |>
  filter(sample_type == 'metagenome') |>
  select(-c(ncbi_metagenome_name, ncbi_metatranscriptome_name,
            replicate, sample_type, year))

cariaco_env_vars = cariaco_env_vars |>
  mutate(across(where(is.character), ~ if_else(.x == 'NA', NA_character_, .x))) |>
    select(-c(NO3, H2S))

res_umap2 =
  rnaseq_umap_tbl |>
    ungroup() |>
    select(-mag_name) |>
    pivot_longer(
      2:last_col(),
      names_to = 'sample',
      values_to = 'count') |>
    pivot_wider(
      names_from = contig,
      values_from = count) |>
    column_to_rownames('sample') |>
    select(-where(~ sum(.x) == 0)) |>
    as.matrix() |>
    umap()


rna_hdbscan_res = hdbscan(res_umap2$layout, minPts = 5)
rna_hdbscan_res$cluster
##  [1] 2 2 2 2 2 2 2 2 2 2 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 2 2 2 2 2 2 2 0 1 1
## [39] 1 1 0 1 1 1 1 1 1 1
umap_df2 <- res_umap2$layout |>
  as.data.frame() |>
  mutate(cluster = rna_hdbscan_res$cluster) |>
  rename(UMAP1="V1",
         UMAP2="V2") |>
  rownames_to_column('sample') |>
  as_tibble() |>
  left_join(cariaco_env_vars, by = c('sample' = 'sample_name_used_for_this_study')) |>
  filter(!is.na(depth))



# Metagenome UMAP analysis ----------------------------------------------------
dnaseq_umap_tbl = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/metaG_processed_mapping_tbl.rds')

dnaseq_umap_tbl = dnaseq_umap_tbl |>  #rnaseq_mapping_counts |>
  filter(gene_name %in% bgc_genes_10kb$gene_name) |>
  mutate(contig = str_replace(gene_name, '(MAG_\\d+)-ctg(.*)_\\d+', '\\1_\\2'),
         mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1')) |>
  relocate(c(mag_name, contig), .before = 1) |>
  select(-gene_name) |>
  group_by(mag_name, contig) |>
  summarize(across(everything(), ~ mean(.x))) |>
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983'))
## `summarise()` has grouped output by 'mag_name'. You can override using the
## `.groups` argument.
dnaseq_res_umap =
  dnaseq_umap_tbl |>
  ungroup() |>
  select(-mag_name) |>
  #column_to_rownames('contig') |>
  pivot_longer(
    2:last_col(),
    names_to = 'sample',
    values_to = 'count') |>
  pivot_wider(
    names_from = contig,
    values_from = count) |>

  left_join(col_metadata, by = c('sample' = 'column')) |>
  mutate(depth = str_replace(sample, '.*(\\d{3}).*', '\\1'),
         replicate = if_else(str_detect(sample, 'a'), 1, 2),
         new_colname = paste0(season, fraction, depth, '_', replicate)) |>
  select(-c(depth, replicate, season, fraction, sample, layer)) |>
  relocate(new_colname) |>
  column_to_rownames('new_colname') |>
  select(-where(~ sum(.x) == 0)) |>
  as.matrix() |>
  umap()

hdbscan_res_dna = hdbscan(dnaseq_res_umap$layout, minPts = 5)
hdbscan_res_dna$cluster
##  [1] 3 3 2 1 2 1 0 0 0 3 3 3 3 3 1 0 1 2 3 0 3 3 3 3 3 2 1 2 1 2 1 3 3 3 3 3 3 1
## [39] 0 2 1 2 1 3 3 3 3
dnaseq_umap_df <- dnaseq_res_umap$layout |>
  as.data.frame() |>
  mutate(cluster = hdbscan_res_dna$cluster) |>
  rename(UMAP1="V1",
         UMAP2="V2") |>
  rownames_to_column('sample') |>
  as_tibble() |>
  left_join(cariaco_env_vars, by = c('sample' = 'sample_name_used_for_this_study')) |>
  filter(!is.na(depth))



# create final joint umap plot - plotting code pasted here again ----------


#metaT plot
metaT_umap_result = umap_df2 |>
  mutate(
    size_fraction = if_else(
      size_fraction == 'free-living (0.2-2.7 um)',
      'Free-living (0.2-2.7 um)', 'Particle-associated (>2.7 um)'),

    redox_regime =
      str_to_title(redox_regime) |>
      paste0(' - ', size_fraction)
  ) |>
  ggplot(aes(x = UMAP1,
             y = UMAP2,
             color = time_point,
             shape = redox_regime
  )) +
  geom_point(size = 4.5) +
  labs(x = "\nUMAP1",
       y = "UMAP2\n",
       color = 'Time point',
       shape = 'Redox regime/size fraction') +
  theme_bw() +
  scale_shape_manual(
    values = c(0,15,1,16,2,17)) +
  scale_color_manual(values = c(
    'May' = '#009E73',
    'November' = '#CC79A7')
  ) +
  theme(
    axis.title = element_text(size = 13),
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    panel.grid = element_blank(),
    legend.position = 'top') +
  guides(shape = 'none')



# metaG plot
metaG_umap_result = dnaseq_umap_df |>
  mutate(
    size_fraction = if_else(
      size_fraction == 'free-living (0.2-2.7 um)',
      'Free-living (0.2-2.7 um)', 'Particle-associated (>2.7 um)'),

    redox_regime =
      str_to_title(redox_regime) |>
      paste0(' - ', size_fraction)
  ) |>
  ggplot(aes(x = UMAP1,
             y = UMAP2,
             color = time_point,
             shape = redox_regime
  )) +
  geom_point(size = 4.5) +
  labs(x = "\nUMAP1",
       y = "UMAP2\n",
       color = 'Time point',
       shape = 'Redox regime/size fraction'
  ) +
  theme_bw() +
  scale_shape_manual(
    values = c(0,15,1,16,2,17)) +
  scale_color_manual(values = c(
    'May' = '#E69F00',
    'November' = '#56B4E9')
  ) +
  theme(
    axis.title = element_text(size = 13),
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    panel.grid = element_blank(),
    legend.position = 'top') +
  guides(shape = 'none')


# just the metaT shape legend
metaT_shape_legend = umap_df2 |>
  mutate(
    size_fraction = if_else(
      size_fraction == 'free-living (0.2-2.7 um)',
      'Free-living (0.2-2.7 um)', 'Particle-associated (>2.7 um)'),

    redox_regime =
      str_to_title(redox_regime) |>
      paste0(' - ', size_fraction)
  ) |>
  ggplot(aes(x = UMAP1,
             y = UMAP2,
             color = time_point,
             shape = redox_regime
  )) +
  geom_point(size = 4.5) +
  labs(x = "\nUMAP1",
       y = "UMAP2\n",
       color = 'Time point',
       shape = 'Redox regime/size fraction') +
  theme_bw() +
  scale_shape_manual(
    values = c(0,15,1,16,2,17)) +
  scale_color_manual(values = c(
    'May' = '#009E73',
    'November' = '#CC79A7')
  ) +
  theme(
    axis.title = element_text(size = 13),
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    panel.grid = element_blank()) +
  guides(color = 'none')

metaT_shape_legend = get_legend(metaT_shape_legend)



umap_final_joint_plot = plot_grid(
  metaG_umap_result, NULL, metaT_umap_result, metaT_shape_legend,
  nrow = 1,
  rel_widths = c(1, 0.05, 1, 1),
  labels = c('a', NA, 'b')
)
umap_final_joint_plot
## Warning: Removed 1 rows containing missing values (`geom_text()`).


Figure 4. Distribution of core and additional biosynthetic genes or domains and transcripts from biosynthetic gene clusters

final_domain_summary = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/final_domain_summary.rds')
smc_palette = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/smc_palette.rds')
final_de_and_clust_data = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/final_de_and_clust_data.rds')

# create plot showing the most frequently occurring core/auxiliary biosynthetic genes in the Cariao MAGs
all_common_domains_plot =
  final_domain_summary %>%
  group_by(description) %>%
  add_tally() %>%
  ungroup() %>%
  filter(!is.na(product)) %>%
  mutate(product = if_else(product == 'PKS', 'Polyketide', product)) %>% 
  mutate(description = case_when(
    str_detect(description,
               regex('Histidine kinase-', ignore_case = TRUE)) ~ 'ATPase',
    str_detect(description, regex('ketoacyl synthase', ignore_case = TRUE)) ~ 'Beta-ketoacyl synthase domain',
    str_detect(description, 'RRE') ~ 'RiPP Recognition Element protein',
    str_detect(description, 'Coenzyme PQQ synthesis') ~ 'PqqD-like domain',
    str_detect(description, 'Radical SAM') ~ 'Radical SAM domain',
    str_detect(description, 'Glycosyl transferase') ~ 'Glycosyl transferase',
    str_detect(description, 'AMP-binding') ~ 'AMP-binding domain',
    str_detect(description, 'Squalene') ~ 'Squalene-hopene cyclase domain',
    str_detect(description, 'Acyl-CoA dehydrogenase') ~ 'Acyl-CoA dehydrogenase domain',
    str_detect(description, regex(
      'FAD dependent oxidoreductase|FMN-dependent oxidoreductase', ignore_case = TRUE)) ~
      'FAD/FMN-dependent oxidoreductase',
    str_detect(description, 'FAD binding|FAD-binding|FMN binding') ~ 'FAD/FMN binding domain',

    TRUE ~ description)) %>% #view()
  filter(n > 0.2 * max(n) |
           description == 'FAD/FMN-dependent oxidoreductase'|
           description == 'FAD/FMN binding domain') %>% #pull(description) %>% unique()
  mutate(group = 'Core and additional biosynthetic genes') %>% #pull(description) %>% tabyl() %>% view()
  select(c(description, product, group)) %>%
  group_by(description, product, group) %>%
  mutate(number = n()) %>%
  ungroup() %>%
  group_by(description) %>%
  add_tally() %>%
  ungroup() %>%
  distinct() %>%
  arrange(desc(n)) %>%
  filter(description != 'ABC Transporter') %>%

  ggplot(aes(
    y = fct_inorder(description),
    x = number,
    fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
  geom_bar(stat = 'identity', position = 'stack') +
  theme_bw() +
  facet_wrap(~ group) +
  theme_bw() +
  theme(
    strip.text = element_text(size = 11),
    axis.text = element_text(size = 9),
    axis.title.y = element_blank(),
    strip.background = element_rect(fill = "wheat1"),
    panel.grid = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = c(0.9875, 0.98),
    legend.justification = c(1, 0.99),
    legend.background = element_blank(),
    panel.border = element_rect(color = 'black', fill = NA),
    legend.box.background = element_rect(color = 'black'),
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10)
    #legend.position = 'none'
  ) +
  scale_fill_manual(values = c(smc_palette), na.value = 'white') +
  labs(
    fill = 'Secondary metabolite class',
    y = 'Antibiotic resistance type',
    x = '\nTotal genes\n'
  ) +
  guides(fill = guide_legend(ncol = 2))




# # create plot showing the most frequently occurring core/auxiliary biosynthetic transcripts in the PA metatranscriptomes
pa_domains_plot =
  final_de_and_clust_data %>%
  filter(gene_name %in% final_domain_summary$gene_name) %>%
  filter(is_sig_pa_biosynthetic | (pa_counts > 0 & fl_counts == 0)) %>%
  rename(raw_product = product) %>%
  left_join(final_domain_summary %>%
              select(c(mag_name, contig, gene_name, product, description)),
            by = c('contig', 'mag_name', 'gene_name')) %>%
  mutate(product = case_when(
    str_detect(product, regex('hybrid', ignore_case = FALSE)) ~ 'Hybrid cluster',
    product == 'Other' ~ 'Other cluster*',
    product == 'PKS' ~ 'Polyketide',
    TRUE ~ product)) %>%
  select(-n) %>%
  filter(!is.na(product)) %>%
  mutate(description = case_when(
    str_detect(description,
               regex('Histidine kinase-', ignore_case = TRUE)) ~ 'ATPase',
    str_detect(description, regex('ketoacyl synthase', ignore_case = TRUE)) ~ 'Beta-ketoacyl synthase domain',
    str_detect(description, 'RRE') ~ 'RiPP Recognition Element protein',
    str_detect(description, 'Coenzyme PQQ synthesis') ~ 'PqqD-like domain',
    str_detect(description, 'Radical SAM') ~ 'Radical SAM domain',
    str_detect(description, 'Glycosyl transferase') ~ 'Glycosyl transferase',
    str_detect(description, 'AMP-binding') ~ 'AMP-binding domain',
    str_detect(description, 'Squalene') ~ 'Squalene-hopene cyclase domain',
    str_detect(description, 'Acyl-CoA dehydrogenase') ~ 'Acyl-CoA dehydrogenase domain',
    str_detect(description, regex(
      'FAD dependent oxidoreductase|FMN-dependent oxidoreductase', ignore_case = TRUE)) ~
      'FAD/FMN-dependent oxidoreductase',
    str_detect(description, 'FAD binding|FAD-binding|FMN binding') ~ 'FAD/FMN binding domain',

    TRUE ~ description)) %>%
  filter(description %in% all_common_domains_plot$data$description) %>%
  mutate(group = 'Particle-associated') %>% #pull(description) %>% tabyl() %>% view()
  select(c(description, product, group)) %>%
  group_by(description, product, group) %>%
  mutate(number = n()) %>%
  ungroup() %>%
  group_by(description) %>%
  add_tally() %>%
  ungroup() %>%
  distinct() %>%
  arrange(desc(n)) %>%
  filter(description != 'ABC Transporter') %>%

  ggplot(aes(
    y = factor(description,
               levels = unique(all_common_domains_plot$data$description)),
    x = number,
    fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
  geom_bar(stat = 'identity', position = 'stack') +
  theme_bw() +
  facet_wrap(~ group) +
  theme_bw() +
  theme(
    strip.text = element_text(size = 11),
    axis.text.x = element_text(size = 9),
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    strip.background = element_rect(fill = "wheat1"),
    panel.grid = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = 'none') +
  scale_fill_manual(values = c(smc_palette), na.value = 'white') +
  labs(
    fill = 'Secondary metabolite class',
    y = 'Antibiotic resistance type',
    x = '\nTotal transcripts') +
  scale_x_continuous(limits = c(0, 200))




# # create plot showing the most frequently occurring core/auxiliary biosynthetic transcripts in the FL metatranscriptomes
fl_domains_plot =
  final_de_and_clust_data %>%
  filter(gene_name %in% final_domain_summary$gene_name) %>%
  filter(is_sig_fl_biosynthetic | (fl_counts > 0 & pa_counts == 0)) %>%
  rename(raw_product = product) %>%
  left_join(final_domain_summary %>%
              select(c(mag_name, contig, gene_name, product, description)),
            by = c('contig', 'mag_name', 'gene_name')) %>%
  mutate(product = case_when(
    str_detect(product, regex('hybrid', ignore_case = FALSE)) ~ 'Hybrid cluster',
    product == 'Other' ~ 'Other cluster*',
    product == 'PKS' ~ 'Polyketide',
    TRUE ~ product)) %>%
  select(-n) %>%
  filter(!is.na(product)) %>%
  mutate(description = case_when(
    str_detect(description,
               regex('Histidine kinase-', ignore_case = TRUE)) ~ 'ATPase',
    str_detect(description, regex('ketoacyl synthase', ignore_case = TRUE)) ~ 'Beta-ketoacyl synthase domain',
    str_detect(description, 'RRE') ~ 'RiPP Recognition Element protein',
    str_detect(description, 'Coenzyme PQQ synthesis') ~ 'PqqD-like domain',
    str_detect(description, 'Radical SAM') ~ 'Radical SAM domain',
    str_detect(description, 'Glycosyl transferase') ~ 'Glycosyl transferase',
    str_detect(description, 'AMP-binding') ~ 'AMP-binding domain',
    str_detect(description, 'Squalene') ~ 'Squalene-hopene cyclase domain',
    str_detect(description, 'Acyl-CoA dehydrogenase') ~ 'Acyl-CoA dehydrogenase domain',
    str_detect(description, regex(
      'FAD dependent oxidoreductase|FMN-dependent oxidoreductase', ignore_case = TRUE)) ~
      'FAD/FMN-dependent oxidoreductase',
    str_detect(description, 'FAD binding|FAD-binding|FMN binding') ~ 'FAD/FMN binding domain',

    TRUE ~ description)) %>%
  filter(description %in% all_common_domains_plot$data$description) %>%
  mutate(group = 'Free-living') %>% #pull(description) %>% tabyl() %>% view()
  select(c(description, product, group)) %>%
  group_by(description, product, group) %>%
  mutate(number = n()) %>%
  ungroup() %>%
  group_by(description) %>%
  add_tally() %>%
  ungroup() %>%
  distinct() %>%
  arrange(desc(n)) %>%
  filter(description != 'ABC Transporter') %>%

  ggplot(aes(
    y = factor(description,
               levels = unique(all_common_domains_plot$data$description)),
    x = number,
    fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
  geom_bar(stat = 'identity', position = 'stack') +
  theme_bw() +
  facet_wrap(~ group) +
  theme_bw() +
  theme(
    strip.text = element_text(size = 11),
    axis.text = element_text(size = 9),
    axis.title.y = element_blank(),
    strip.background = element_rect(fill = "wheat1"),
    panel.grid = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = 'none') +
  scale_fill_manual(values = c(smc_palette), na.value = 'white') +
  labs(
    fill = 'Secondary metabolite class',
    y = 'Antibiotic resistance type',
    x = '\nTotal transcripts') +
  scale_x_reverse(limits = c(200, 0))



# combine the PA and FL transcript expression bar plots
final_domains_plotgrid = plot_grid(
  fl_domains_plot,
  pa_domains_plot,
  nrow = 1,
  rel_widths = c(0.65, 0.35)
)


# combine the expression barplots with the genomic potential bar plot to create the final plot
FINAL_ALL_DOMAINS_PLOT =
  plot_grid(all_common_domains_plot,
            final_domains_plotgrid,
            ncol = 1,
            labels = 'auto',
            rel_heights = c(0.51, 0.49))

FINAL_ALL_DOMAINS_PLOT


Supplementary Figure 1. Frequency of Cariaco prokaryotic MAGs (≥75% completeness,

≤5% contamination) by bacterial (a) and archaeal phylum (b)

# Distribution of MAG phyla recovered and average genome completeness per phylum --------------------------------
sm_key = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/sm_key.rds')
mag_table = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/mag_table.rds')

heat_colors <- c(colorRampPalette(colors = 'white')(1),
                 colorRampPalette(colors = c('#ABD9E9',
                                             '#FEE090',
                                             'orange',
                                             '#D73027'))(8000))

sm_summary <- sm_key %>%
  select(mag_name, cluster, sm_class) %>%
  distinct() %>%
  left_join(mag_table, by = 'mag_name')

# plot used
mag_dist_plot_archaea <- mag_table %>%
  group_by(phylum) %>%
  summarize(freq = n()) %>%
  arrange(desc(freq)) %>%
  mutate(x = row_number(),
         phylum) %>%
  left_join(mag_table %>%
              select(phylum, domain, completeness) %>%
              group_by(phylum) %>%
              add_tally(mean(completeness), name = 'average_completeness') %>%
              select(-completeness) %>%
              ungroup() %>%
              distinct(),
            by = 'phylum') %>%
  mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum)) %>%
  mutate(phylum = as_factor(phylum)) %>%
  filter(domain == 'Archaea') %>%
  ggplot(aes(phylum, freq)) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 65, hjust = 1, size = 8),
        axis.title.x = element_text(size = 10),
        axis.title.y = element_blank(),
        strip.text = element_text(size = 10),
        strip.background = element_rect(fill = 'wheat1'),
        axis.text.y = element_text(size = 8),
        legend.position = 'none'
  ) +
  geom_segment(aes(xend = phylum,  yend = 0), color = 'grey50') +
  geom_point(size = 1.5,
             aes(color = average_completeness)) +
  geom_text(aes(label = freq, vjust = -0.65), size = 2.5) +
  facet_grid(~ domain, scales = 'free', space = 'free') +
  labs(x = 'Phylum',
       y = 'Number of MAGs per phylum\n',
       color = 'Average genome
completeness per phylum',

  ) +
  scale_color_gradientn(colors = heat_colors,
                        breaks = seq(75, 100, by = 5),
                        limits = c(79, 100)
  ) +
  scale_y_continuous(limits = c(0, 75),
                     expand = expansion(add = c(1, 0)))


mag_dist_plot_bacteria =
  mag_table %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%
  group_by(phylum) %>%
  summarize(freq = n()) %>%
  arrange(desc(freq)) %>%
  mutate(x = row_number(),
         phylum) %>%
  left_join(mag_table %>%
              select(phylum, domain, completeness) %>%
              group_by(phylum) %>%
              add_tally(mean(completeness), name = 'average_completeness') %>%
              select(-completeness) %>%
              ungroup() %>%
              distinct(),
            by = 'phylum') %>%
  mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum)) %>%
  mutate(phylum = as_factor(phylum)) %>%
  filter(domain == 'Bacteria') %>%
  ggplot(aes(phylum, freq)) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 65, hjust = 1, size = 8),
        axis.title = element_text(size = 10),
        strip.text = element_text(size = 10),
        strip.background = element_rect(fill = 'wheat1'),
        legend.title = element_text(size = 8),
        legend.direction = 'horizontal',
        legend.margin = margin(c(4,15,4,4)),
        legend.text = element_text(size = 8),
        axis.text.y = element_text(size = 8),
        legend.position = c(0.71, 0.87),
        legend.background = element_rect(fill = "white",
                                         color = "black")
  ) +
  geom_segment(aes(xend = phylum,  yend = 0), color = 'grey50') +
  geom_point(size = 1.5,
             aes(color = average_completeness)) +
  geom_text(aes(label = freq, vjust = -0.65), size = 2.5) +
  facet_grid(~ domain, scales = 'free', space = 'free') +
  labs(x = 'Phylum',
       y = 'Number of MAGs per phylum\n',
       color = 'Average genome
completeness per phylum',
  ) +
  scale_color_gradientn(colors = heat_colors,
                        breaks = seq(75, 100, by = 5),
                        limits = c(79, 100)
  ) +
  scale_y_continuous(limits = c(0, 75),
                     expand = expansion(add = c(1, 0)))

 
mag_dist_plot = plot_grid(mag_dist_plot_bacteria, mag_dist_plot_archaea,
                          labels = c('a', 'b'),
                          rel_widths = c(4, 1),
                          nrow = 1,
                          label_size = 11.5,
                          align = 'hv',
                          label_x = c(0.045, 0.205))

mag_dist_plot


Supplementary Figure 2. MAG relative abundances

Process the CoverM relative abundance output files, plot relative abundance of the most abundant MAGs

mag_table = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/mag_table.rds')

final_genome_rel_abun_tbl = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/rel_abun_tibble_48_samples.rds')

# create sample names key
metaG_sampleNames_key.X = tibble(
  sample_name = colnames(final_genome_rel_abun_tbl)[2:ncol(final_genome_rel_abun_tbl)],
  depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  fraction = if_else(str_detect(sample_name, 'A'), 'PA', 'FL'),
  season = case_when(
    str_detect(sample_name, 'AM|BM') ~ 'M',
    str_detect(sample_name, 'AN|BN') ~ 'N',
    depth %in% c(103,198,234,295,314) ~ 'M',
    depth %in% c(143,200,237,247,267) ~ 'N'),
  replicate = if_else(str_detect(sample_name, 'a'), 1, 2),
  new_sample_name = paste0(season, fraction, depth, '_', replicate)
)

all(colnames(final_genome_rel_abun_tbl)[2:ncol(final_genome_rel_abun_tbl)] == metaG_sampleNames_key.X$sample_name)
## [1] TRUE
#TRUE

# color palette listing the color choices for all annotation rows/columns on the heatmap
relPalette <- list(Layer = c(
  'Oxycline' = 'darkslategray3',
  'Shallow anoxic' = 'gold',
  'Euxinic' =  'sienna3'),

  Fraction = c('PA' = 'darkgreen',
               'FL' = 'darkseagreen2'),

  Season = c('May' = 'slategray',
             'November' = 'slategray1'),

                   `Depth (m)` = c(
                     '103' = 'grey90',
                     '143' = 'grey85',
                     '198' = 'grey80',
                     '200' = 'grey75',
                     '234' = 'grey70',
                     '237' = 'grey65',
                     '247' = 'grey60',
                     '267' = 'grey55',
                     '295' = 'grey50',
                     '314' = 'grey45',
                     '900' = 'grey20'
                   ),

                   `Phylum/Class` = c('Planctomycetota' = 'wheat1',
                                      'Desulfobacterota' = 'tomato',
                                      'Myxococcota' = 'orange',
                                      'Alphaproteobacteria' = 'orchid1',
                                      'Gammaproteobacteria' = 'thistle1',
                                      'Chloroflexota' = 'slateblue1',
                                      'Marinisomatota' = 'aquamarine',
                                      'Aenigmarchaeota' = 'gold',
                                      'Krumholzibacteriota' = 'darkred',
                                      'Omnitrophota' = 'darkviolet',
                                      'Verrucomicrobiota' = 'khaki',
                                      'Thermoplasmatota' = 'firebrick1',
                                      'AABM5-125-24' = 'gainsboro',
                                      'Bacteroidota' = 'springgreen',
                                      'SAR324' = 'chocolate4',
                                      'Cloacimonadota' = 'grey28',
                                      'Crenarchaeota' = 'yellowgreen',
                                      'Acidobacteriota' = 'deepskyblue',
                                      'Actinobacteriota' = '#666666',
                                      'Eisenbacteria' = '#B89B74',
                                      'Gemmatimonadota' = 'lightsteelblue1',
                                      'Altiarchaeota' = 'ivory',
                                      'Nitrospinota' = '#2A7FB7',
                                      'Fermentibacterota' = '#1B9E77',
                                      'Latescibacterota' = '#AD4C9E',
                                      'Patescibacteria' = 'dodgerblue4',
                                      'Undinarchaeota' = 'turquoise',
                                      'Unclassified' = '#5A8950'
                   ))

mag_table = mag_table %>%
  mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum))


final_genome_rel_abun_tbl = 
  final_genome_rel_abun_tbl %>%
  rename(mag_name = Genome) %>%
  filter(!mag_name %in% c('unmapped', paste0('CarAnox_mtb2_', c(1507, 983, 769)))) %>%
  mutate(mag_name = str_replace(mag_name, 'CarAnox_mtb2', 'MAG')) %>%
  left_join(
    mag_table %>%
      select(mag_name, tax_num),
    by = 'mag_name'
  ) %>% 
  relocate(tax_num, .after = 1) %>%
  arrange(tax_num) %>%
  select(-mag_name) %>%
  column_to_rownames(var = 'tax_num') %>%
  mutate(across(1:last_col(), ~ log1p(.x)), # log-transform relative abundance percentages 
         row_sums = rowSums(.[1:ncol(.)])) %>%
  filter(row_sums > 0.005 * max(row_sums)) %>% # keep only the most abundant MAGs across the samples
  select(-row_sums) %>%
  rename_with(
    ~ metaG_sampleNames_key.X$new_sample_name,
    .cols = 1:last_col()) %>%
  relocate(
    metaG_sampleNames_key.X %>%
      arrange(desc(fraction), depth) %>%
      pull(new_sample_name),
    .after = 1)

# create character vector of phylums of the most abundant MAGs, sorted by the most to the least frequently appearing phylum
# split Proteobacteria in Alpha- and Gammaproteobacteria classes
phylum_order =
  final_genome_rel_abun_tbl %>%
  mutate(row_sums = rowSums(.),
         tax_num = rownames(.)) %>%
  left_join(
    mag_table %>%
      select(tax_num, phylum, class),
    by = 'tax_num') %>%
  select(c(row_sums, tax_num, phylum, class)) %>%
  mutate(phylum = if_else(phylum == 'Proteobacteria', class, phylum)) %>%
  select(-class) %>%
  group_by(phylum) %>%
  mutate(group_sum = sum(row_sums)) %>%
  ungroup() %>%
  arrange(desc(group_sum), phylum) %>%
  pull(phylum) %>%
  unique()

#change UAP2
phylum_order[28] = 'Undinarchaeota'

# create final relative abundance matrix to be used for plotting
final_genome_rel_abun_tbl =
  final_genome_rel_abun_tbl %>%
  mutate(row_sums = rowSums(.),
         tax_num = rownames(.)) %>%
  left_join(
    mag_table %>%
      select(tax_num, phylum, class),
    by = 'tax_num') %>%
  mutate(phylum = if_else(
    phylum == 'Proteobacteria', class, phylum)) %>%
  select(-class) %>%
  group_by(phylum) %>%
  mutate(group_sum = sum(row_sums)) %>%
  ungroup() %>%
  arrange(desc(group_sum), phylum) %>%
  select(-c(row_sums, group_sum, phylum)) %>%
  column_to_rownames(var = 'tax_num') %>%
  relocate( # function rearrange the columns by season, and water layer - incrementally increasing in depth
    colnames(.) %>%
      {function(x) {
        tbl = tibble(
          mpa = x[str_detect(x, 'MPA')],
          npa = sort(c('NPA143_2', x[str_detect(x, 'NPA')])),
          mfl = x[str_detect(x, 'MFL')],
          nfl = x[str_detect(x, 'NFL')]
        ) %>%
          mutate(groupper = rep(paste0('group_',
                                       seq(1, 6, by = 1)),
                                each = 2)) %>%
          group_by(groupper) %>%
          summarize(across(everything(), ~
                             paste0(.x, collapse = ';')),
                    .groups = 'drop') %>%
          select(-groupper)

        pa = as.vector(rbind(tbl$mpa, tbl$npa))
        fl = as.vector(rbind(tbl$mfl, tbl$nfl))

        g = c(pa, fl)
        g = str_split(g, pattern = ';') %>% unlist()
        g = g[g != 'NPA143_2']

        return(g)}}()
    ) %>%
  as.matrix()


# row metadata for relative abundance plot - the phylum/class of each MAG
row_anno =
  final_genome_rel_abun_tbl %>%
  as.data.frame() %>%
  rownames_to_column(var = 'tax_num') %>%
  select(tax_num) %>%
  left_join(
    mag_table %>%
      select(tax_num, phylum, class),
    by = 'tax_num') %>%
  as_tibble() %>%
  mutate(phylum = if_else(
    phylum == 'Proteobacteria', class, phylum)) %>%
  select(-class) %>%
  mutate(phylum = if_else(phylum == 'UAP2', 'Undinarchaeota', phylum)) %>% 
  rename(`Phylum/Class` = phylum) %>%
  arrange(factor(`Phylum/Class`, levels = phylum_order)) %>%
  column_to_rownames(var = 'tax_num')


# sort the Phylum/Class color palette by frequency
relPalette$`Phylum/Class` = relPalette$`Phylum/Class`[unique(row_anno$`Phylum/Class`)]


# column metadata for the relative abundance heatmap; annotation codings for sample fraction, sampling season, water layer, and water depth
col_anno = tibble(
  cols = colnames(final_genome_rel_abun_tbl),
  Fraction = str_replace(cols, '[A-Z]{1}([A-Z]{2})\\d{3}.*', '\\1'),
  Season = if_else(str_detect(cols, 'M'), 'May', 'November'),
  Layer = str_replace(cols, '[A-Z]+(\\d+)_.*', '\\1')
) %>%
  mutate(Layer = as.integer(Layer),
         Layer = case_when(
           Layer < 247 ~ 'Oxycline',
           Layer >= 247 & Layer < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(`Depth (m)` = str_replace(
    cols, '.*(\\d{3}).*', '\\1'
  )) %>%
  column_to_rownames(var = 'cols') %>%
  relocate(`Depth (m)`, Fraction, Season, Layer)

# create the heatmap
final_genome_rel_abun_plot = final_genome_rel_abun_tbl %>%
  pheatmap::pheatmap(
    name = 'Relative abundance (% total reads)',
    angle_col = '315',
    fontsize = 8,
    annotation_row = row_anno,
    annotation_col = col_anno,
    annotation_colors = relPalette,
    show_colnames = FALSE,
    gaps_col = 23,
    clustering_method = 'ward.D',
    cluster_rows = FALSE,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    treeheight_row = 0,
    treeheight_col = 0,
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)),
    legend_breaks = seq(0, 3.5, by = 0.5),
    legend_labels = ComplexHeatmap::gt_render(c(
      '0%   ',
      '0.64%   ',
      '1.72%   ',
      '3.48%   ',
      '6.39%   ',
      '11.18%   ',
      '19.09%   ',
      '32.12%   '))
  )

final_genome_rel_abun_plot


Supplementary Figure 3. MAG differential abundance (fraction preference)

Load required libraries, create matrix of counts. Each column will represent a metagenomic sample and rows will correspond to metagenome-assembled genomes (MAGs). Each cell will contain the number of reads that mapped to the contigs of a MAG.

library(furrr) # library of purrr-style iterators that utilize multiple cores for parallel processing
library(DESeq2) # library for running differential abundance analysis
library(BiocParallel) # for DESeq2 parallel processing


# if using a multicore machine, example furrr parallel processing parameters are below:  
plan(multicore, workers = 35) # 35 cores allotted to furrr's multicore iterators
options(future.globals.maxSize = 5242880000) #5000 * 1024^2; 5Gb memory allotted to each core

# set directory to where the coverM files are located
setwd('~/path/to/github_data/coverM/dna/final_counts_of_reads_mapped_to_genomes/')

files = system('ls *.tsv', intern = TRUE) 
sample_names = str_replace(files, '_counts_reads_mapped_per_genome.tsv', '')

# Create a list of read-mapping results (in tibble format); mapped reads to the MAGs (all contigs)
# Note: furrr's iterators are implemented like purrr's with the added 'future_' prepended to the function name; it will call the iterations in parallel across multiple cores
mapped_read_counts = future_map2( 
  files,
  sample_names, ~
    vroom::vroom(
      file = .x,
      delim = '\t',
      col_names = c('mag_name', .y),
      skip = 1,
      show_col_types = FALSE
    )
)

# Join all read-mapping results together into a tibble by the column of MAG names in each list component
mapped_read_counts =
  reduce(mapped_read_counts, left_join, by = 'mag_name')


Load additional data related to the differential abundance analysis, prep the DESeq2 object and run DESeq2

# Load a tibble with taxonomic and assembly information for each MAG
#mag_table <- readRDS('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/coverM/deseq2_dnaseq_rdata/draftMag-taxonomic-table-cariaco.RDA')

mag_table = read_csv('rdata-files/mags_75compl_5cont.csv', show_col_types = FALSE)

taxonomic_ranks = c('domain', 'phylum', 'class', 'order', 'family', 'genus', 'species')

mag_table = mag_table |>
  rename(mag_name = bin_id) |>
  mutate(mag_name = str_replace(mag_name, 'CarAnox_mtb2-', 'MAG_')) |>
  select(mag_name, classification, completeness, contamination, strain_het) |>
  separate(classification, into = taxonomic_ranks, sep = ';') |>
  mutate(across(all_of(taxonomic_ranks), ~ str_replace(.x, '[:alpha:]__', ''))) |>
  mutate(across(all_of(taxonomic_ranks), ~ if_else(.x == '', NA_character_, .x))) |>
  mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum),
         tax_num = str_replace(mag_name, 'MAG', phylum), .after = mag_name)


# Create metadata tibble for the metagenomic samples
sampleNames_key <- tibble(
  sample_name = colnames(mapped_read_counts)[2:ncol(mapped_read_counts)]) %>%
  mutate(depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
         season = case_when(
           (depth == 103 | depth == 198 | depth == 234 | depth == 295 | depth == 314) ~ 'M',
           (depth == 143 | depth == 200 | depth == 237 | depth == 247 | depth == 267) ~ 'N',
                            str_detect(sample_name, 'M') ~ 'M',
                            str_detect(sample_name, 'N') ~ 'N'),
         fraction = if_else(str_detect(sample_name, 'A'), 'PA', 'FL'),
         replicate = if_else(str_detect(sample_name, 'a'), 1, 2),
         new_column_names = paste0(season, fraction, depth, '_', replicate)) %>%
  arrange(factor(sample_name, levels = colnames(select(mapped_read_counts, -mag_name))))

all(sampleNames_key$sample_name == colnames(select(mapped_read_counts, -mag_name))) #TRUE

# Create sample metadata for use in the DESeq() function. This analysis will look for differential abundance patterns between fractions in three different chemically distinct water layers: the oxycline, shallow anoxic, and euxinic depths
coldata <- sampleNames_key %>%
  select(depth, fraction, new_column_names) %>%
  mutate(layer = case_when(depth == 900 ~ 'euxinic',
                           depth >= 103 & depth <= 237 ~ 'oxycline',
                           depth > 237 & depth < 900 ~ 'shallow.anoxic')) %>%
  select(-depth) %>%
  dplyr::rename(sample = new_column_names) %>%
  unite(group, c(fraction, layer), sep = '_') %>%
  relocate(sample) %>%
  mutate(group = as_factor(group))

# Rename the mapped-read counts tibble with metagenomic sample naming scheme used in the paper
mapped_read_counts = mapped_read_counts %>%
  rename_with(
    ~ sampleNames_key$new_column_names,
    .cols = 2:last_col()) %>%
  column_to_rownames(var = 'mag_name')

# Create the DESeq2 analysis object
da_dds <- DESeq2::DESeqDataSetFromMatrix(countData = mapped_read_counts,
                                         colData = coldata,
                                         design = ~ group)

# Run the DESeq2 statistical model on the data; assumes the data have a negative binomial distribution
da_final_dds <- DESeq(da_dds) 


Extract the results from the oxycline, shallow anoxic, and euxinic depths that compare PA and FL fraction types

# pull Wald test results comparing the oxycline PA to FL fractions
oxycline <- results(da_final_dds, contrast = c('group',
                                               'PA_oxycline', 'FL_oxycline'), alpha = 0.05)

# pull Wald test results comparing the euxinic PA to FL fractions
euxinic <- results(da_final_dds, contrast = c('group',
                                              'PA_euxinic', 'FL_euxinic'), alpha = 0.05)

# pull Wald test results comparing the shallow anoxic PA to FL fractions
shallow_anoxic <- results(da_final_dds, contrast = c('group', 'PA_shallow.anoxic',
                                                     'FL_shallow.anoxic'), alpha = 0.05)

# create a comprehensive tibble including all differential abundance results
# classify MAG fraction "preference" based on the direction of log2FoldChange and the adjusted p-values
allDaResults <- list(oxycline = oxycline,
                     shallow_anoxic = shallow_anoxic,
                     euxinic = euxinic) %>%
  imap(~ .x %>%
         as.data.frame() %>%
         rownames_to_column(var = 'mag_name') %>%
         mutate(mag_name = str_replace(mag_name, 'CarAnox_mtb2', 'MAG')) %>%
         as_tibble() %>%
         mutate(preference = case_when(log2FoldChange > 0 & padj < 0.05 ~ 'particle enriched',
                                       log2FoldChange < 0 & padj < 0.05 ~ 'free-living enriched',
                                       padj > 0.05 | is.na(padj) ~ 'no significant difference'),
                layer = .y) %>%
         left_join(mag_table %>% select(mag_name, phylum), by = 'mag_name') %>%
         mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum),
                preference = factor(preference,
                                    levels = c(
                                      'particle enriched',
                                      'free-living enriched',
                                      'no significant difference'))))

# save the differential abundance final results tibble
saveRDS(allDaResults, file = '~/path/to/github_data/coverM/dna/rdata/deseq2_dnaseq_rdata/deseq2_results_by_water_layer.rds')


Plot the differential abundance results

library(tidyverse) # loads several useful data-wrangling and plotting libraries
library(gridtext) # for editing text in heatmap plots
library(ComplexHeatmap) # for heatmap plots
library(pheatmap) # for heatmap plots
library(cowplot) # for combining various plots together in grids
library(ggtext) # used for ggplots
library(glue) # used for ggplots


#locally load DESeq2 results
allDaResults = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/deseq2_results_by_water_layer.rds')

## plotting
magFractionPref <- allDaResults %>%
  map_dfr(~
            .x %>%
              mutate(layer = case_when(
                layer == 'oxycline' ~ 'Oxycline',
                layer == 'euxinic' ~ 'Euxinic',
                layer == 'shallow_anoxic' ~ 'Shallow anoxic',
              ))
          )

magFractionPref_plot = magFractionPref %>%
  ggplot(aes(y = fct_rev(fct_infreq(phylum)),
             fill = fct_relevel(preference, c('no significant difference',
                                              'free-living enriched',
                                              'particle enriched')))) +
  geom_bar() +
  labs(x = '\nNumber of genomes', y = 'Phylum\n', fill = 'Fraction preference') +
  theme_bw() +
  scale_fill_manual(values = c('grey80', 'palegreen3', 'burlywood2')) +
  facet_wrap(~ fct_relevel(layer, c('Oxycline', 'Shallow anoxic', 'Euxinic'))) +
  theme(
    axis.title = element_text(size = 10),
    strip.text = element_text(size = 10),
    strip.background = element_rect(fill = 'wheat1'),
    panel.grid = element_blank(),
    legend.position = 'top',
    legend.justification = c(0.92,0)
  )
magFractionPref_plot


Supplementary Figure 4. Distribution of biosynthetic gene clusters identified using antiSMASH 6.0

bgc_summary = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/bgc_summary.rds')

bgc_10k_kb = bgc_summary %>%
  mutate(size = end - start, .after = contig) %>%
  filter(size >= 10000)

bgc_count_data = bgc_10k_kb %>%
  mutate(mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1'), .before = contig) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>% # remove MAGs that are from laboratory contamination
  select(-mag_name) %>%
  mutate(clust = paste0('clust_', 1:n()), .before = size) %>%
  group_by(contig) %>%
  group_split() %>%
  map_dfr(~ .x %>%
            mutate(ID.match = map2(
              1:length(start), list(.), ~
                .y %>%
                filter(
                  start > start[.x] & start < end[.x] | # start of the BGC is greater than your start, and less than your end
                    start < start[.x] & end > start[.x] | # start of the BGC is less than your start, and end is greater than your start
                    start == start[.x] & start < end[.x]) %>% # start of the BGC is equal to your start, and less than your end
                pull(clust)),
            .before = start)) %>%
  group_by(contig) %>%
  mutate(concat = list(unlist(ID.match)), .after = ID.match) %>%
  mutate(concat = map(concat, ~ unique(.x[duplicated(.x)]))) %>%
  mutate(length_concat = length(concat), .after = concat) %>%
  mutate(n = n(), .after = length_concat) %>%
  ungroup()


all(bgc_count_data$length_concat == bgc_count_data$n) # TRUE; all contigs w/ multiple BGCs have some level of overlap amongst them; can classify these all as 'hybrid clusters' of various sorts
## [1] TRUE
bgc_count_data = bgc_count_data %>%
  select(-c(ID.match, concat, length_concat)) %>%
  group_by(contig) %>%
  mutate(is_hybrid = if_else(n() > 1, TRUE, FALSE), .after = contig)


bgc_count_data_summary = bgc_count_data %>%
  ungroup() %>%
  select(-detection_rule) %>%
  mutate(product = str_replace(product, '-like', ''),
         product = str_replace(product, '.*PKS', 'PKS')) %>%
  arrange(contig, product) %>%
  group_by(contig, is_hybrid) %>%
  select(-c(filename, filepath)) %>%
  summarize(across(everything(), ~ paste0(.x, collapse = ';')), .groups = 'drop') %>%
  mutate(product = if_else(
    str_detect(product, ';'), str_replace(product, '(.*)', '\\1 hybrid'), product),
    product = str_replace_all(product, ';', '-')
  )

bgc_count_data_summary = bgc_count_data_summary %>%
  mutate(
    product = case_when(
      !is_hybrid & str_detect(product, 'lanthipeptide|RRE-containing|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides') ~ 'RiPP',
      !is_hybrid & str_detect(product, 'lactone') ~ 'Lactone',
      !is_hybrid & str_detect(product, 'acyl_amino_acids|nucleoside|PBDE|indole|siderophore|resorcinol|ectoine|NAPAA|CDPS|LAP|redox-cofactor') ~ 'other',
      !is_hybrid & str_detect(product, 'hglE-KS') ~ 'PKS',
      !is_hybrid & str_detect(product, 'arylpolyene') ~ 'Aryl polyene',
      !is_hybrid & str_detect(product, 'redox-cofactor') ~ 'Redox-cofactor',
      #
      is_hybrid & str_detect(product, 'NRPS') & str_detect(product, 'PKS|hglE-KS') ~ 'NRPS-PKS hybrid',
      is_hybrid & str_detect(product, 'NRPS') & !str_detect(product, 'PKS|hglE-KS|lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') ~ 'NRPS hybrid',
      is_hybrid & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'PKS|NRPS|hglE-KS') ~ 'RiPP hybrid',
      is_hybrid & str_detect(product, 'PKS|hglE-KS') & !str_detect(product, 'NRPS|lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') ~ 'PKS hybrid',
      is_hybrid & str_detect(product, 'NRPS') & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'PKS|hglE-KS') ~ 'NRPS-RiPP hybrid',
      is_hybrid & str_detect(product, 'PKS|hglE-KS') & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'NRPS') ~ 'PKS-RiPP hybrid',
      is_hybrid & str_detect(product, 'terpene') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS') ~ 'Terpene hybrid',
      is_hybrid & str_detect(product, 'ladderane') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Ladderane hybrid',
      is_hybrid & str_detect(product, 'arylpolyene') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Aryl polyene hybrid',
      is_hybrid & str_detect(product, 'lactone') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Lactone hybrid',
      is_hybrid & str_detect(product, 'CDPS') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Other hybrid',
      #
      TRUE ~ product
    )) %>%
  mutate(product = if_else(str_detect(product, '^[:lower:]'), str_to_title(product), product))


bgc_count_data_summary = bgc_count_data_summary %>%
  mutate(mag_name = str_replace(contig, '(MAG_\\d+)_\\d+', '\\1'), .before = contig) %>% #create mag name col
  left_join(mag_table %>% select(mag_name, domain, phylum), by = 'mag_name') %>% # add phylum, domain cols
  relocate(c(domain, phylum), .after = 1) %>% # relocate cols
  select(c(phylum, domain, product)) %>% # select some cols
  mutate(n_product_in_phylum = 1,
         product = if_else(str_detect(product, 'hybrid'), 'Hybrid cluster', product)) %>% # edit product col, add sum col, sm class
  group_by(phylum, domain, product) %>%
  summarize(n_product_in_phylum = sum(n_product_in_phylum), .groups = 'drop') %>% # sum up each type of bgc in each phylum
  left_join(mag_table %>%
              select(phylum) %>%
              group_by(phylum) %>%
              summarize(n_mags_in_phylum = n()),
            by = 'phylum') %>% # join col that says how many MAGs are in each phylum
  mutate(norm_sm_freq = n_product_in_phylum / n_mags_in_phylum, # divide #bgc / # mag per phylum for each bgc per phylum
         product = if_else(product == 'Other', 'Other cluster*', product)) %>%
  #
  group_by(phylum) %>%
  add_tally(sum(norm_sm_freq), name = 'total_sm_freq') %>%
  ungroup() %>%
  arrange(desc(total_sm_freq), phylum) %>%
  mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum))


# Pie chart of Cariaco MAG BGCs groupped by class -------------------------

final_pie =
  bgc_count_data_summary %>%
  mutate(product = if_else(product == 'PKS', 'Polyketide', product)) %>%
  select(product, n_product_in_phylum) %>%
  group_by(product) %>%
  summarize(n_product_in_phylum = sum(n_product_in_phylum)) %>%
  left_join(
    tibble(
      product = names(smc_palette),
      color = unname(smc_palette)),
    by = 'product')


library(plotly) # load plotly for plotting
## 
## Attaching package: 'plotly'
## The following object is masked _by_ '.GlobalEnv':
## 
##     highlight
## The following object is masked from 'package:ComplexHeatmap':
## 
##     add_heatmap
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# create the pie chart and save it
final_bgc_pie = plot_ly(final_pie,
                  labels = ~product,
                  values = ~n_product_in_phylum,
                  type = 'pie',
                  textposition = 'inside',
                  textinfo = 'label+percent+value',
                  insidetextfont = list(color = '#FFFFFF'),
                  text = ~n_product_in_phylum,
                  marker = list(colors = ~color,
                                line = list(
                                  color = '#FFFFFF',
                                  width = 1)),
                  showlegend = FALSE) %>%
  layout(xaxis = list(
    showgrid = FALSE,
    zeroline = FALSE,
    showticklabels = FALSE),
    yaxis = list(
      showgrid = FALSE,
      zeroline = FALSE,
      showticklabels = FALSE))

final_bgc_pie


Processing of RNA-Seq read-mapping .paf files from Minimap2

Process the .paf output files

# load required libraries
library(tidyverse)
library(furrr)

#set parallel processing params
plan(multicore, workers = 48)
options(future.globals.maxSize = 52428800000)

# metadata file of gene names
gene_metadata = tibble(
  gene_name = readLines('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/FINAL_MAY_2022_ALL_ANTISMASH6_CONCAT_GENE_FASTA_INDEX/final_concat_gene_names.txt')
) %>%
  separate(gene_name,
           into = c('gene_name', 'start', 'end', 'strand', 'extra_data'),
           sep = ' # ')

gene_metadata = gene_metadata %>%
  mutate(gene_name = str_replace(gene_name, '^>', ''))

empty_gene_counts_filler = gene_metadata %>%
  select(gene_name) %>%
  mutate(n_reads_mapped = 0L)

rm(gene_metadata)

# set working directory to the one with the .paf output files
setwd('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaT_mapping_final_results_used')

# create vector of .paf full file paths
file_paths = system('ls *.paf', intern = TRUE)
sample_names = str_replace(file_paths, '.paf', '')

#function to process the .paf files
prepare_read_mappings = function(.empty_gene_counts_filler, .file_path) {

  mapping_results = tibble(temp = readLines(.file_path))

  mapping_results[c('qname', 'qlen', 'qstart', 'qend',
                    'strand', 'tname', 'tlen', 'tstart',
                    'tend', 'nmatch', 'alen', 'mapq',
                    'align_type', 'X14', 'X15', 'X16', 'X17')] <- str_split_fixed(mapping_results$temp, '\t', 17)

  mapping_results = mapping_results %>%
    select(-temp) %>% # remove temp col containing whole line information
    select(1:13) %>% # select columns 1:13
    mutate(across(c(qlen, qstart, qend,
                    tlen, tstart, tend,
                    nmatch, alen), ~ as.integer(.x)))

  # filter to keep only primary alignment results
  mapping_results = mapping_results %>%
    filter(
      align_type == 'tp:A:P', # retain primary alignments; note: there are no secondary/suppl. alignments in any .paf files
      mapq != 255, # filter out any reads that did not map; note: there were no such scores in any .paf files
      alen >= 0.5 * qlen, # the alignment length is at least 50% of the read length
      nmatch >= 0.95 * alen) %>%  # at least 95% of the aligned bases are a match
    group_by(qname) %>%
    filter(mapq == max(mapq)) %>%  # for any read with multiple primary mappings, keep alignment with highest score
    slice_sample(n = 1) %>%
    ungroup()

  mapping_results = mapping_results %>%
    group_by(tname) %>% # create groupped tibble; groupped by target sequences (genes)
    summarize(n_reads_mapped = n()) # aggregate the tibble, summing up counts of mapped reads

  # bind the counts of mapped reads to genes with all the other genes, counted zero times
  mapping_results = mapping_results %>%
    rename(gene_name = tname) %>%
    bind_rows(
      filter(.empty_gene_counts_filler, !(gene_name %in% .$gene_name))
    )
  
  return(mapping_results)
}

# process .paf files using parallel processing
processed_mapping_counts = future_map2(
  .options = furrr_options(seed = 111),
  list(empty_gene_counts_filler),
  file_paths, ~
    prepare_read_mappings(.x, .y)
)

# save the list of processed files
saveRDS(processed_mapping_counts, file = 'rdata/list_of_mapped_metaT_reads_from_each_sample.rds')


Finish processing the read-mapping results, calculate normalized Transcript Per Million (TPM) values for the final dataset

# create tibble that has all metatranscriptome read-mapping results joined together
processed_mapping_counts =
  reduce(processed_mapping_counts, left_join, by = 'gene_name') %>% 
  rename_with( # rename the meteatranscriptome sample columns based on naming scheme used in the paper
    ~ sample_names,
    .cols = 2:last_col()) %>%
  mutate(gene_name = str_replace(
    gene_name, '(MAG_\\d+)_(\\d+_\\d+)', '\\1-ctg\\2'))

#read in gene lengths data
gl = readRDS('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/rdata/gene_lengths_tibble.rda')

gl = gl %>%
  mutate(gene_length = gene_length / 1000) # convert unit length to kilobases

# join gene length data to read-mapping data
# divide each read-mapping count by the length of the gene that it mapped to
processed_mapping_counts = processed_mapping_counts %>%
  left_join(gl, by = 'gene_name') %>%
  relocate(gene_length, .after = 1) %>%
  mutate(across(3:last_col(), ~ .x / gene_length))

# function to convert RNA-seq counts to TPM; TPM is normalized across and within samples
# the counts tibble has already had each cell divided by gene length
# this function will calculate the TPM scaling factor for each column and divide each cell in a column by the scaling factor, for all columns
get_tpm_for = function(col) {
  scaling_factor = sum(col) / 1e6
  col = col / scaling_factor
  return(col)
}

# finalize the TPM normalization
processed_mapping_counts = processed_mapping_counts %>%
  select(-gene_length) %>%
  mutate(across(2:last_col(), ~ get_tpm_for(.x)))

# read in biosynthetic gene cluster (BGC) metadata, parsed from antiSMASH 6 output
bgc = vroom::vroom(
  '/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/salmon_related_data_files/bgc_genes_from_10kb_clusters.txt',
  show_col_types = FALSE)

bgc = bgc %>%
  mutate(gene_name = str_replace(
    gene_name,
    '(^ctg\\d+_\\d+)-(MAG.*)', '\\2-\\1'
  ))

# filter the TPM-normalized read-mapping data to retain only biosynthetic gene cluster
# transcript mappings from biosynthetic clusters that were at least 10 kilobases in size
processed_mapping_counts = processed_mapping_counts %>%
  filter(gene_name %in% bgc$gene_name)


Processing of DNA-Seq read-mapping .paf files from Minimap2

Process the .paf output files

# load required libraries
library(tidyverse)
library(furrr)

# set working directory to the one with the .paf output files
setwd('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaG_mapping_final_results_used')

#set parallel processing params
plan(multicore, workers = 19)
#(125000 * 1024 ^2) %>% clipr::write_clip()
options(future.globals.maxSize = 1.31072e+11) #125000 * 1024 ^2

empty_gene_counts_filler = readRDS('rdata/metaG_empty_gene_counts_filler.rds')

# create vector of .paf full file paths
file_paths = system('ls *.paf', intern = TRUE)
sample_names = str_replace(file_paths, '.paf', '')

#function to process the .paf files
prepare_read_mappings = function(.empty_gene_counts_filler, .file_path) {

  mapping_results = tibble(temp = readLines(.file_path))

  mapping_results[c('qname', 'qlen', 'qstart', 'qend',
                    'strand', 'tname', 'tlen', 'tstart',
                    'tend', 'nmatch', 'alen', 'mapq',
                    'align_type', 'X14', 'X15', 'X16', 'X17')] <- str_split_fixed(mapping_results$temp, '\t', 17)

  mapping_results = mapping_results |>
    select(-temp) |> # remove temp col containing whole line information
    select(1:13) |> # select columns 1:13
    mutate(across(c(qlen, qstart, qend,
                    tlen, tstart, tend,
                    nmatch, alen), ~ as.integer(.x)))

  # filter to keep only top primary alignment results
  mapping_results = mapping_results |>
    filter(
      align_type == 'tp:A:P', # keep primary alignments
      mapq != 255, # filter out any reads that did not map
      alen >= 0.5 * qlen, # the alignment length is at least 50% of the read length
      nmatch >= 0.95 * alen) |>  # at least 95% of the aligned bases are a match
    group_by(qname) |>
    filter(mapq == max(mapq)) |>  # for any read with multiple primary mappings, keep alignment with highest score
    slice_sample(n = 1) |>
    ungroup()

  mapping_results = mapping_results %>%
    group_by(tname) %>% # create groupped tibble; groupped by target sequences (genes)
    summarize(n_reads_mapped = n()) # aggregate the tibble, summing up counts of mapped reads

  # bind the counts of mapped reads to genes with all the other genes, counted zero times
  mapping_results = mapping_results %>%
    rename(gene_name = tname) %>%
    bind_rows(
      filter(.empty_gene_counts_filler, !(gene_name %in% .$gene_name))
    )

  return(mapping_results)
}

# process .paf files using parallel processing
processed_mapping_counts = future_map2(
  list(empty_gene_counts_filler),
  file_paths, ~
    prepare_read_mappings(.x, .y)
)

# save the list of processed files
saveRDS(processed_mapping_counts, file = 'rdata/list_of_mapped_metaG_reads_from_each_sample.rds')


Differential gene expression analysis

Prepare gene expression matrix for DESeq2 differential expression analysis

# deseq2 ALL PA DEPTHS VS ALL FL DEPTHS run on minimap2 counts files -- 95% min percent identity of alignments at least 50% of read length aligned  --------

# set working dir to the one containing .paf files from metatranscriptomic read-mapping to all cariaco MAG genes
setwd('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaT_mapping_final_results_used/')

files = system('ls *.paf', intern = TRUE)
sample_names = str_replace(files, '.paf', '')

# load list of tibbles - each tibble containing the read-mapping counts to genes from each metatranscriptomic sample
mapped_read_counts = readRDS('rdata/list_of_mapped_metaT_reads_from_each_sample.rds')

# join the tibbles together
mapped_read_counts =
  reduce(mapped_read_counts, left_join, by = 'gene_name')

# rename samples to use sample names used in the paper
mapped_read_counts =
  mapped_read_counts %>%
  rename_with(
    ~ sample_names,
    .cols = 2:last_col()
  )

mapped_read_counts =
  mapped_read_counts %>%
  mutate(gene_name = str_replace(
    gene_name, '(MAG_\\d+)_(\\d+_\\d+)', '\\1-ctg\\2'))  %>%
  as.data.frame() %>%
  column_to_rownames(var = 'gene_name')

# create column metadata for use with the DESeq() function
col_data = tibble(sample = colnames(mapped_read_counts)) %>%
  mutate(fraction = if_else(str_detect(sample, 'PA'), 'PA', 'FL'),
         depth = str_replace(sample, '[:ALPHA:]{3}(\\d+)_\\d', '\\1'),
         depth = as.integer(depth),
         layer = case_when(depth == 900 ~ 'euxinic',
                           depth >= 103 & depth <= 237 ~ 'oxycline',
                           depth > 237 & depth < 900 ~ 'shallow.anoxic'),
         group = paste0(fraction, '_', layer)) %>%
  select(-c(fraction, depth, layer)) %>%
  mutate(group = str_replace(group, '(.*)_.*', '\\1')) %>%
  mutate(group = as_factor(group))

all(colnames(mapped_read_counts) == col_data$sample) # TRUE

# load a tibble containing the length in bp of each gene from all Cariaco MAGs
gl = readRDS('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/rdata/gene_lengths_tibble.rda')

gl = gl %>%
  mutate(gene_length = gene_length / 1000) # convert length unit to kilobases

logTransformed_mapping_tpm = mapped_read_counts %>%
  rownames_to_column(var = 'gene_name') %>%
  as_tibble() %>%
  left_join(gl, by = 'gene_name') %>%
  relocate(gene_length, .after = 1) %>%
  mutate(across(3:last_col(), ~ .x / gene_length))


get_tpm_for = function(col) {
  scaling_factor = sum(col) / 1e6
  col = col / scaling_factor
  return(col)
}

logTransformed_mapping_tpm = logTransformed_mapping_tpm %>%
  select(-gene_length) %>%
  mutate(across(2:last_col(), ~ get_tpm_for(.x))) %>%
  mutate(across(2:last_col(), ~ log1p(.x)))

bgc = vroom::vroom(
  '/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/salmon_related_data_files/bgc_genes_from_10kb_clusters.txt',
  show_col_types = FALSE)

bgc = bgc %>%
  mutate(gene_name = str_replace(
    gene_name,
    '(^ctg\\d+_\\d+)-(MAG.*)', '\\2-\\1'
  ))

logTransformed_mapping_tpm = logTransformed_mapping_tpm %>%
  filter(gene_name %in% bgc$gene_name)



da_dds <- DESeq2::DESeqDataSetFromMatrix(mapped_read_counts,
                                         colData = col_data,
                                         design = ~ group)

register(MulticoreParam(40))

da_final_dds <- DESeq2::DESeq(da_dds, parallel = TRUE) # run the DESeq2 statistical model which assumes a negative binomial 

# pull Wald test results comparing the oxycline PA to FL fractions
res <- DESeq2::results(da_final_dds,
                       contrast = c('group',
                                    'PA',
                                    'FL'),
                       alpha = 0.05)


final_res <- res %>%
  as.data.frame() %>%
  rownames_to_column(var = 'mag_name') %>%
  as_tibble() %>%
  mutate(enrichment = case_when(
    log2FoldChange > 0 & padj < 0.05 ~ 'particle enriched',
    log2FoldChange < 0 & padj < 0.05 ~ 'free-living enriched',
    padj > 0.05 | is.na(padj) ~ 'no significant difference'))


col_metadata = tibble(sample = colnames(mapped_read_counts)[
  2:ncol(mapped_read_counts)
  ]) %>%
  mutate(fraction = if_else(str_detect(sample, 'PA'), 'PA', 'FL'),
         depth = str_replace(sample, '[:ALPHA:]{3}(\\d+)_\\d', '\\1'),
         depth = as.integer(depth),
         layer = case_when(depth == 900 ~ 'euxinic',
                           depth >= 103 & depth <= 237 ~ 'oxycline',
                           depth > 237 & depth < 900 ~ 'shallow.anoxic'))

mapped_read_counts = mapped_read_counts %>%
  rownames_to_column(var = 'gene_name') %>%
  as_tibble()

cs = mapped_read_counts %>%
  mutate(mag_name = str_replace(gene_name, '(MAG_\\d+)-.*', '\\1'), .before = 1)

cs = cs %>%
  mutate(
    fl_counts = rowSums(select(., col_metadata %>%
                                 filter(fraction == 'FL') %>%
                                 pull(sample))),
    pa_counts = rowSums(select(., col_metadata %>%
                                 filter(fraction == 'PA') %>%
                                 pull(sample)))
  ) %>%
  relocate(c(fl_counts, pa_counts), .after = gene_name) %>%
  select(1:8)



#bgc = readRDS('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/rdata/bgc_kb_5000_gene_data.rda')


bgc_uniq = bgc %>%
  select(mag_name, gene_name) %>%
  distinct() %>%
  mutate(is_biosynthetic = TRUE)


sig_genes = res %>%
  as.data.frame() %>%
  rownames_to_column(var = 'gene_name') %>%
  as_tibble() %>%
  left_join(bgc_uniq %>%
              select(-mag_name),
            by = 'gene_name')

#pa_sig
sig_genes %>%
  filter(is_biosynthetic, log2FoldChange > 0, padj <= 0.05) %>%
  mutate(is_sig_pa_biosynthetic = TRUE) %>%
  select(gene_name, is_sig_pa_biosynthetic)


#fl_sig
sig_genes %>%
  filter(is_biosynthetic, log2FoldChange < 0, padj <= 0.05) %>%
  mutate(is_sig_fl_biosynthetic = TRUE) %>%
  select(gene_name, is_sig_fl_biosynthetic)




clust_analysis = pmap_df(list(
  list(sig_genes),

  list(sig_genes %>%
         filter(is_biosynthetic, log2FoldChange > 0, padj <= 0.05) %>%
         mutate(is_sig_pa_biosynthetic = TRUE) %>%
         select(gene_name, is_sig_pa_biosynthetic)),

  list(sig_genes %>%
         filter(is_biosynthetic, log2FoldChange < 0, padj <= 0.05) %>%
         mutate(is_sig_fl_biosynthetic = TRUE) %>%
         select(gene_name, is_sig_fl_biosynthetic)),

  list(
    cs %>% select(2:4)
  )
), ~
  ..1 %>%
  left_join(..2, by = 'gene_name') %>%
  left_join(..3, by = 'gene_name') %>%
  left_join(..4, by = 'gene_name')
)

clust_data_pa = clust_analysis %>%
  filter(is_sig_pa_biosynthetic |
           (pa_counts > 0 & fl_counts == 0 & is_biosynthetic)
  ) %>%
  relocate(c(fl_counts, pa_counts, padj,
             is_sig_pa_biosynthetic,
             is_sig_fl_biosynthetic,
             is_biosynthetic), .after = gene_name) %>%
  left_join(bgc, by = 'gene_name')

clust_data_fl = clust_analysis %>%
  filter(is_sig_fl_biosynthetic |
           (fl_counts > 0 & pa_counts == 0 & is_biosynthetic)
  ) %>%
  relocate(c(fl_counts, pa_counts, padj,
             is_sig_pa_biosynthetic,
             is_sig_fl_biosynthetic,
             is_biosynthetic), .after = gene_name) %>%
  left_join(bgc, by = 'gene_name')


bgc_summary = bgc %>%
  group_by(contig, region) %>%
  summarize(n = n(), .groups = 'drop')


pa_expressed = clust_data_pa %>%
  left_join(bgc_summary, by = c('contig', 'region')) %>%
  group_by(contig, region) %>%
  rename(cluster_size = n) %>%
  add_tally() %>%
  ungroup() %>%
  mutate(proportion_expressed = n / cluster_size)

pa_expressed_summary = pa_expressed %>%
  select(product, proportion_expressed,
         contig, region, cluster_size, n) %>%
  distinct()

fl_expressed = clust_data_fl %>%
  left_join(bgc_summary, by = c('contig', 'region')) %>%
  group_by(contig, region) %>%
  rename(cluster_size = n) %>%
  add_tally() %>%
  ungroup() %>%
  mutate(proportion_expressed = n / cluster_size)


fl_expressed_summary = fl_expressed %>%
  select(product, proportion_expressed,
         contig, region, cluster_size, n) %>%
  distinct()


####
map_chr(ls(), ~ paste0(.x, ': ', object.size(get(.x)) %>% format(units = "Mb")))

final_de_and_clust_data = pa_expressed %>%
  bind_rows(fl_expressed)

test_final_de = readRDS('~/Downloads/final_de_and_uniq_expr_clust_data.rds')


#PA
test_final_de %>%
  filter(
    (padj < 0.05 & log2FoldChange > 0) |
      (pa_counts > 0 & fl_counts == 0)) %>%
  left_join(
    mag_table %>%
      select(mag_name, phylum, class),
    by = 'mag_name') %>%
  pull(phylum) %>%
  table() %>%
  sort()


#FL
test_final_de %>%
  filter(
    (padj < 0.05 & log2FoldChange < 0) |
      (fl_counts > 0 & pa_counts == 0)) %>%
  left_join(
    mag_table %>%
      select(mag_name, phylum, class),
    by = 'mag_name') %>%
  pull(phylum) %>%
  table() %>%
  sort()


test_final_de %>%
  filter(pa_counts > 0 & fl_counts == 0) %>%
  nrow()

test_final_de %>%
  filter(fl_counts > 0 & pa_counts == 0) %>%
  nrow()


clust_gene_summary = readRDS('~/Documents/cariaco-tidy-analysis/cariaco_github_data_small/rdata/clust_gene_summary.rds')

allDaResults = allDaResults %>%
  map_dfr(~ .x)

allDaResults = allDaResults %>%
  group_by(mag_name) %>%
  mutate(strict_fraction_preference = case_when(
    all(preference == 'particle enriched') ~ 'PA',
    all(preference == 'free-living enriched') ~ 'FL',
    TRUE ~ 'no strict preference'), .after = 1) %>%
  ungroup() %>%
  arrange(phylum, mag_name)

strict_pa_mags = allDaResults %>%
  filter(strict_fraction_preference == 'PA') %>%
  pull(mag_name) %>%
  unique()

strict_fl_mags = allDaResults %>%
  filter(strict_fraction_preference == 'FL') %>%
  pull(mag_name) %>%
  unique()

# fl transporter summary tibble
fl_transporter_summary =
  clust_gene_summary %>%
  filter(mag_name %in% strict_fl_mags) %>%
  left_join(mag_table %>%
              select(mag_name, phylum),
            by = 'mag_name') %>%
  group_by(contig, product) %>%
  mutate(contains_transporter = if_else(str_detect(description, 'transport') | str_detect(gene_functions, 'transport'), TRUE, FALSE),
         .after = product) %>%
  mutate(contains_transporter = if_else(is.na(contains_transporter), FALSE, contains_transporter)) %>%
  select(mag_name, phylum, contig, product, contains_transporter) %>%
  mutate(temp = if_else(any(contains_transporter), TRUE, FALSE)) %>%
  select(-contains_transporter) %>%
  ungroup() %>%
  rename(contains_transporter = temp) %>%
  distinct()



# pa transporter summary tibble
pa_transporter_summary =
  clust_gene_summary %>%
  filter(mag_name %in% strict_pa_mags) %>%
  left_join(mag_table %>%
              select(mag_name, phylum),
            by = 'mag_name') %>%
  group_by(contig, product) %>%
  mutate(contains_transporter = if_else(str_detect(description, 'transport') | str_detect(gene_functions, 'transport'), TRUE, FALSE),
         .after = product) %>%
  mutate(contains_transporter = if_else(is.na(contains_transporter), FALSE, contains_transporter)) %>%
  select(mag_name, phylum, contig, product, contains_transporter) %>%
  mutate(temp = if_else(any(contains_transporter), TRUE, FALSE)) %>%
  select(-contains_transporter) %>%
  ungroup() %>%
  rename(contains_transporter = temp) %>%
  distinct()


###
pa_transporter_summary %>%
  group_by(phylum) %>%
  summarize(
    has_transporter = sum(contains_transporter),
    no_transporter = sum(!contains_transporter)
  )
###
fl_transporter_summary %>%
  group_by(phylum) %>%
  summarize(
    has_transporter = sum(contains_transporter),
    no_transporter =  sum(!contains_transporter)) %>%
  print(n = 30)
## # A tibble: 25 × 3
##    phylum           has_transporter no_transporter
##    <chr>                      <int>          <int>
##  1 AABM5-125-24                   0              4
##  2 Acidobacteriota                3             19
##  3 Aenigmarchaeota                1              0
##  4 Altiarchaeota                  1              0
##  5 Bacteroidota                  10              2
##  6 Campylobacterota               1              0
##  7 Chloroflexota                  6             20
##  8 Cyanobacteria                  2              0
##  9 Desulfobacterota              11             27
## 10 Gemmatimonadota                2              4
## 11 Iainarchaeota                  0              1
## 12 Latescibacterota               8             14
## 13 Marinisomatota                 5              2
## 14 Nanoarchaeota                 10             13
## 15 Nitrospinota                   1              5
## 16 Omnitrophota                  50             58
## 17 Patescibacteria                1              0
## 18 Planctomycetota                9             13
## 19 Proteobacteria                47             48
## 20 SAR324                         7             15
## 21 Spirochaetota                  0              1
## 22 Thermoplasmatota               1              0
## 23 UAP2                           0              1
## 24 UBA10199                       6              4
## 25 <NA>                           5              1
#####
logTransformed_tpm_for_bgc_genes_minimap2_res = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/logTransformed_tpm_for_bgc_genes_minimap2_res.rds')


# temp_FL_rowAnno = temp_FL_rowAnno %>%
#   mutate(temp = as.numeric(contains_transporter))

bgc_genes_10kb =
  bgc_genes_10kb %>%
  mutate(gene_name = str_replace(gene_name, '(ctg.*_\\d+)-(MAG_\\d+)', '\\2-\\1'))


# tpm of each gene from clusters >= 10kb in size -- FL
temp_FL_rowAnno = logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, contains('FL')) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(
    bgc_genes_10kb %>%
      left_join(fl_transporter_summary %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  arrange(contains_transporter) %>%
  column_to_rownames(var = 'gene_name') %>%
  select(contains_transporter) %>%
  mutate(contains_transporter = as.numeric(contains_transporter))


temp_sum_FL_rowAnno = logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, contains('FL')) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(
    bgc_genes_10kb %>%
      left_join(fl_transporter_summary %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  select(-row_sums) %>%
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(contains_transporter) %>%
  column_to_rownames(var = 'temp') %>%
  select(contains_transporter) %>%
  mutate(contains_transporter = as.numeric(contains_transporter))
temp_sum_FL_rowAnno
temp_sum_PA_rowAnno = logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, contains('PA')) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(
    bgc_genes_10kb %>%
      left_join(pa_transporter_summary %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  select(-row_sums) %>%
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(contains_transporter) %>%
  column_to_rownames(var = 'temp') %>%
  select(contains_transporter) %>%
  mutate(contains_transporter = as.numeric(contains_transporter))


Supplementary Figure 5. Biosynthetic transcript expression of MAGs with strict PA and FL fraction preferences

# analysis by water layer -------------------------------------------------

# list of MAGs with FL fraction preference, one list per water layer
fl_fract_prefs_list = list(

  oxycline = allDaResults %>%
    filter(preference == 'free-living enriched', layer == 'oxycline', padj <= 0.01),

  shallow_anoxic = allDaResults %>%
    filter(preference == 'free-living enriched', layer == 'shallow_anoxic', padj <= 0.01),

  euxinic = allDaResults %>%
    filter(preference == 'free-living enriched', layer == 'euxinic', padj <= 0.01)
)

#View(fl_fract_prefs_list)

# list of MAGs with PA fraction preference, one list per water layer
pa_fract_prefs_list = list(

  oxycline = allDaResults %>%
    filter(preference == 'particle enriched', layer == 'oxycline', padj <= 0.01),

  shallow_anoxic = allDaResults %>%
    filter(preference == 'particle enriched', layer == 'shallow_anoxic', padj <= 0.01),

  euxinic = allDaResults %>%
    filter(preference == 'particle enriched', layer == 'euxinic', padj <= 0.01)
)

##
fl_transporter_list = fl_fract_prefs_list %>%
  map(~
  clust_gene_summary %>%
  filter(mag_name %in% .x$mag_name) %>%
  left_join(mag_table %>%
              select(mag_name, phylum),
            by = 'mag_name') %>%
  group_by(contig, product) %>%
  mutate(contains_transporter = if_else(str_detect(description, 'transport') | str_detect(gene_functions, 'transport'), TRUE, FALSE),
         .after = product) %>%
  mutate(contains_transporter = if_else(is.na(contains_transporter), FALSE, contains_transporter)) %>%
  select(mag_name, phylum, contig, product, contains_transporter) %>%
  mutate(temp = if_else(any(contains_transporter), TRUE, FALSE)) %>%
  select(-contains_transporter) %>%
  ungroup() %>%
  rename(contains_transporter = temp) %>%
  distinct()
)

##
pa_transporter_list = pa_fract_prefs_list %>%
  map(~
        clust_gene_summary %>%
        filter(mag_name %in% .x$mag_name) %>%
        left_join(mag_table %>%
                    select(mag_name, phylum),
                  by = 'mag_name') %>%
        group_by(contig, product) %>%
        mutate(contains_transporter = if_else(str_detect(description, 'transport') | str_detect(gene_functions, 'transport'), TRUE, FALSE),
               .after = product) %>%
        mutate(contains_transporter = if_else(is.na(contains_transporter), FALSE, contains_transporter)) %>%
        select(mag_name, phylum, contig, product, contains_transporter) %>%
        mutate(temp = if_else(any(contains_transporter), TRUE, FALSE)) %>%
        select(-contains_transporter) %>%
        ungroup() %>%
        rename(contains_transporter = temp) %>%
        distinct()
  )

# create color specifications for column annotations
anno_colors = list(
  Fraction = c(
    'PA' = 'darkgreen',
    'FL' = 'darkseagreen2'),
  Layer = c(
    'Oxycline' = 'darkslategray3', #'wheat2',
    'Shallow anoxic' = 'gold',# 'khaki2',
    'Euxinic' =  'sienna3'),

  Season = c('May' = 'slategray',
             'November' = 'slategray1'),

  `Depth (m)` = c(
    '103' = 'grey90',
    '148' = 'grey85',
    '198' = 'grey80',
    '200' = 'grey75',
    '234' = 'grey70',
    '237' = 'grey65',
    '247' = 'grey60',
    '267' = 'grey55',
    '295' = 'grey50',
    '314' = 'grey45',
    '900' = 'grey20'
  )
)


fl_prefer_expr_profile_oxy_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, matches(paste0('.*', c(103,148,198,200,234,237), '.*'))) %>%

  mutate(across(2:last_col(), ~ expm1(.x))) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(
    bgc_genes_10kb %>%
      left_join(fl_transporter_list$oxycline %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%  #
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  mutate(across(where(is.double), ~ log1p(.x))) %>%
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(contains_transporter, row_sums) %>%
  column_to_rownames(var = 'temp') %>%
  select(-c(mag_name, contig, region, product,
            contains_transporter, row_sums)) %>%
  relocate(contains('PA')) %>%
  as.matrix()


  # create column annotation information object
  oxy_fl_col_anno = tibble(
    sample_name = colnames(fl_prefer_expr_profile_oxy_data),
    Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
    Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
    Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
  ) %>%
    mutate(Depth = as.integer(Depth),
           Layer = case_when(
             Depth < 247 ~ 'Oxycline',
             Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
             TRUE ~ 'Euxinic'
           )) %>%
    mutate(Depth = as.character(Depth)) %>%
    rename(`Depth (m)` = Depth) %>%
    column_to_rownames(var = 'sample_name') %>%
    select(-c(`Depth (m)`, Layer))


  fl_prefer_expr_profile_oxy_plot = fl_prefer_expr_profile_oxy_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    gaps_col = 12,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    annotation_col = oxy_fl_col_anno,
    annotation_colors = anno_colors,

    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)),
    legend_breaks = seq(0, 6, by = 2),
    legend_labels = gt_render(c(
      '0',
      '6.39 x 10^0',
      '5.36 x 10^1',
      '4.02 x 10^2')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )


pa_prefer_expr_profile_oxy_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, matches(paste0('.*', c(103,148,198,200,234,237), '.*'))) %>%

  mutate(across(2:last_col(), ~ expm1(.x))) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(
    bgc_genes_10kb %>%
      left_join(pa_transporter_list$oxycline %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%  #
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  #
  mutate(across(where(is.double), ~ log1p(.x))) %>%
  #
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(contains_transporter, row_sums) %>%
  column_to_rownames(var = 'temp') %>%
  select(-c(mag_name, contig, region, product,
            contains_transporter, row_sums)) %>%
  relocate(contains('PA')) %>%
  as.matrix()


# create column annotation information object
oxy_pa_col_anno = tibble(
  sample_name = colnames(pa_prefer_expr_profile_oxy_data),
  Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
  Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
  mutate(Depth = as.integer(Depth),
         Layer = case_when(
           Depth < 247 ~ 'Oxycline',
           Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(Depth = as.character(Depth)) %>%
  rename(`Depth (m)` = Depth) %>%
  column_to_rownames(var = 'sample_name') %>%
  select(-c(`Depth (m)`, Layer))


pa_prefer_expr_profile_oxy_plot = pa_prefer_expr_profile_oxy_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    gaps_col = 12,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    annotation_col = oxy_pa_col_anno,
    annotation_colors = anno_colors,

    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)
      ),
    legend_breaks = 0:4,
    legend_labels = gt_render(c(
      '0',
      '1.72',
      '6.39',
      '19.09',
      '53.60')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )


oxycline_frac_pref_expr_plot = plot_grid(
  NULL,
  grid.grabExpr(draw(
    pa_prefer_expr_profile_oxy_plot,
    merge_legend = TRUE,
    heatmap_legend_side = 'right',
    annotation_legend_side = 'right')),
  NULL,
  grid.grabExpr(draw(
    fl_prefer_expr_profile_oxy_plot,
    merge_legend = TRUE,
    heatmap_legend_side = 'right',
    annotation_legend_side = 'right')),
  nrow = 1,
  rel_widths = c(0.1, 1, 0.1, 1),
  labels = c('PA','', 'FL', '')
)



# bgc expr profiles of fl- and pa- pref mags in -- SHALLOW ANOXIC ---------------
fl_prefer_expr_profile_anox_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, matches(paste0('.*', c(247,267,295,314), '.*'))) %>%

  mutate(across(2:last_col(), ~ expm1(.x))) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(bgc_genes_10kb %>%
      left_join(fl_transporter_list$shallow_anoxic %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%  #
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  #
  mutate(across(where(is.double), ~ log1p(.x))) %>%
  #
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(contains_transporter, row_sums) %>%
  column_to_rownames(var = 'temp') %>%
  select(-c(mag_name, contig, region, product,
            row_sums, contains_transporter)) %>%
  relocate(colnames(.) %>%
             {function(x) {
               tbl = tibble(
                 mpa = x[str_detect(x, 'MPA')],
                 npa = x[str_detect(x, 'NPA')],
                 mfl = x[str_detect(x, 'MFL')],
                 nfl = x[str_detect(x, 'NFL')]
               ) %>%
                 mutate(groupper = rep(paste0('group_', seq(1, 2, by = 1)), each = 2)) %>%
                 group_by(groupper) %>%
                 summarize(across(everything(), ~
                                    paste0(.x, collapse = ';')),
                           .groups = 'drop') %>%
                 select(-groupper)

               pa = as.vector(rbind(tbl$mpa, tbl$npa))
               fl = as.vector(rbind(tbl$mfl, tbl$nfl))

               g = c(pa, fl)
               g = str_split(g, pattern = ';') %>% unlist()

               return(g)}}()) %>%
  as.matrix()


# create column annotation information object
anox_fl_col_anno = tibble(
  sample_name = colnames(fl_prefer_expr_profile_anox_data),
  Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
  Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
  mutate(Depth = as.integer(Depth),
         Layer = case_when(
           Depth < 247 ~ 'Oxycline',
           Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(Depth = as.character(Depth)) %>%
  rename(`Depth (m)` = Depth) %>%
  column_to_rownames(var = 'sample_name') %>%
  select(-c(`Depth (m)`, Layer))


fl_prefer_expr_profile_anox_plot = fl_prefer_expr_profile_anox_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    gaps_col = 8,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    annotation_col = anox_fl_col_anno,
    annotation_colors = anno_colors,

    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)),
    legend_breaks = seq(0, 6, by = 2),
    legend_labels = gt_render(c(
      '0',
      '6.39 x 10^0',
      '5.36 x 10^1',
      '4.02 x 10^2')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )


pa_prefer_expr_profile_anox_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, matches(paste0('.*', c(247,267,295,314), '.*'))) %>%

  mutate(across(2:last_col(), ~ expm1(.x))) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(bgc_genes_10kb %>%
      left_join(pa_transporter_list$shallow_anoxic %>%
                  select(contig, product, contains_transporter),
                by = c('contig', 'product')) %>%
      filter(!is.na(contains_transporter)) %>%
      select(-c(cluster_kb, contig_edge)),
    by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%  #
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  #
  mutate(across(where(is.double), ~ log1p(.x))) %>%
  #
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(row_sums, contains_transporter) %>%
  column_to_rownames(var = 'temp') %>%
  select(-c(mag_name, contig, region, product,
            row_sums, contains_transporter)) %>%
  relocate(colnames(.) %>%
             {function(x) {
               tbl = tibble(
                 mpa = x[str_detect(x, 'MPA')],
                 npa = x[str_detect(x, 'NPA')],
                 mfl = x[str_detect(x, 'MFL')],
                 nfl = x[str_detect(x, 'NFL')]
               ) %>%
                 mutate(groupper = rep(paste0('group_', seq(1, 2, by = 1)), each = 2)) %>%
                 group_by(groupper) %>%
                 summarize(across(everything(), ~
                                    paste0(.x, collapse = ';')),
                           .groups = 'drop') %>%
                 select(-groupper)

               pa = as.vector(rbind(tbl$mpa, tbl$npa))
               fl = as.vector(rbind(tbl$mfl, tbl$nfl))

               g = c(pa, fl)
               g = str_split(g, pattern = ';') %>% unlist()

               return(g)}}()) %>%
  as.matrix()


# create column annotation information object
anox_pa_col_anno = tibble(
  sample_name = colnames(pa_prefer_expr_profile_anox_data),
  Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
  Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
  mutate(Depth = as.integer(Depth),
         Layer = case_when(
           Depth < 247 ~ 'Oxycline',
           Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(Depth = as.character(Depth)) %>%
  rename(`Depth (m)` = Depth) %>%
  column_to_rownames(var = 'sample_name') %>%
  select(-c(`Depth (m)`, Layer))


pa_prefer_expr_profile_anox_plot = pa_prefer_expr_profile_anox_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    gaps_col = 8,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    annotation_col = anox_pa_col_anno,
    annotation_colors = anno_colors,

    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)
      ),
    legend_breaks = 0:5,
    legend_labels = gt_render(c(
      '0',
      '1.72 x 10^0',
      '6.39 x 10^0',
      '1.91 x 10^1',
      '5.36 x 10^1',
      '1.47 x 10^2')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )


anoxic_frac_pref_expr_plot = plot_grid(
  NULL,
  grid.grabExpr(draw(
    pa_prefer_expr_profile_anox_plot,
    merge_legend = TRUE,
    heatmap_legend_side = 'right',
    annotation_legend_side = 'right')),
  NULL,
  grid.grabExpr(draw(
    fl_prefer_expr_profile_anox_plot,
    merge_legend = TRUE,
    heatmap_legend_side = 'right',
    annotation_legend_side = 'right')),
  nrow = 1,
  rel_widths = c(0.1, 1, 0.1, 1),
  labels = c('PA','', 'FL', '')
)


# bgc expr profiles of fl- and pa- pref mags in -- EUXINIC ---------------
fl_prefer_expr_profile_eux_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, matches(paste0('.*', 900, '.*'))) %>%

  mutate(across(2:last_col(), ~ expm1(.x))) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(bgc_genes_10kb %>%
              left_join(fl_transporter_list$euxinic %>%
                          select(contig, product, contains_transporter),
                        by = c('contig', 'product')) %>%
              filter(!is.na(contains_transporter)) %>%
              select(-c(cluster_kb, contig_edge)),
            by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%  #
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  #
  mutate(across(where(is.double), ~ log1p(.x))) %>%
  #
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(contains_transporter, row_sums) %>%
  column_to_rownames(var = 'temp') %>%
  select(-c(mag_name, contig, region, product,
            row_sums, contains_transporter)) %>%
  relocate(colnames(.) %>%
             {function(x) {
               tbl = tibble(
                 mpa = x[str_detect(x, 'MPA')],
                 npa = x[str_detect(x, 'NPA')],
                 mfl = x[str_detect(x, 'MFL')],
                 nfl = x[str_detect(x, 'NFL')]
               ) %>%
                 mutate(groupper = rep(paste0('group_', seq(1, 1, by = 1)), each = 2)) %>%
                 group_by(groupper) %>%
                 summarize(across(everything(), ~
                                    paste0(.x, collapse = ';')),
                           .groups = 'drop') %>%
                 select(-groupper)

               pa = as.vector(rbind(tbl$mpa, tbl$npa))
               fl = as.vector(rbind(tbl$mfl, tbl$nfl))

               g = c(pa, fl)
               g = str_split(g, pattern = ';') %>% unlist()

               return(g)}}()) %>%
  as.matrix()


# create column annotation information object
eux_fl_col_anno = tibble(
  sample_name = colnames(fl_prefer_expr_profile_eux_data),
  Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
  Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
  mutate(Depth = as.integer(Depth),
         Layer = case_when(
           Depth < 247 ~ 'Oxycline',
           Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(Depth = as.character(Depth)) %>%
  rename(`Depth (m)` = Depth) %>%
  column_to_rownames(var = 'sample_name') %>%
  select(-c(`Depth (m)`, Layer))


fl_prefer_expr_profile_eux_plot = fl_prefer_expr_profile_eux_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    gaps_col = 4,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    annotation_col = eux_fl_col_anno,
    annotation_colors = anno_colors,

    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)),
    legend_breaks = seq(0, 6, by = 2),
    legend_labels = gt_render(c(
      '0',
      '6.39 x 10^0',
      '5.36 x 10^1',
      '4.02 x 10^2')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )


pa_prefer_expr_profile_eux_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  select(gene_name, matches(paste0('.*', 900, '.*'))) %>%

  mutate(across(2:last_col(), ~ expm1(.x))) %>%
  mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
  left_join(bgc_genes_10kb %>%
              left_join(pa_transporter_list$euxinic %>%
                          select(contig, product, contains_transporter),
                        by = c('contig', 'product')) %>%
              filter(!is.na(contains_transporter)) %>%
              select(-c(cluster_kb, contig_edge)),
            by = 'gene_name') %>%
  filter(!is.na(contains_transporter)) %>%
  filter(row_sums > 0) %>%
  filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%  #
  group_by(mag_name, contig, region, product, contains_transporter) %>%
  summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
  #
  mutate(across(where(is.double), ~ log1p(.x))) %>%
  #
  mutate(temp = paste0(contig, ';', product), .before = 1) %>%
  arrange(row_sums, contains_transporter) %>%
  column_to_rownames(var = 'temp') %>%
  select(-c(mag_name, contig, region, product,
            row_sums, contains_transporter)) %>%
  relocate(colnames(.) %>%
             {function(x) {
               tbl = tibble(
                 mpa = x[str_detect(x, 'MPA')],
                 npa = x[str_detect(x, 'NPA')],
                 mfl = x[str_detect(x, 'MFL')],
                 nfl = x[str_detect(x, 'NFL')]
               ) %>%
                 mutate(groupper = rep(paste0('group_', seq(1, 1, by = 1)), each = 2)) %>%
                 group_by(groupper) %>%
                 summarize(across(everything(), ~
                                    paste0(.x, collapse = ';')),
                           .groups = 'drop') %>%
                 select(-groupper)

               pa = as.vector(rbind(tbl$mpa, tbl$npa))
               fl = as.vector(rbind(tbl$mfl, tbl$nfl))

               g = c(pa, fl)
               g = str_split(g, pattern = ';') %>% unlist()

               return(g)}}()) %>%
  as.matrix()


# create column annotation information object
eux_pa_col_anno = tibble(
  sample_name = colnames(pa_prefer_expr_profile_eux_data),
  Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
  Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
  Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
  mutate(Depth = as.integer(Depth),
         Layer = case_when(
           Depth < 247 ~ 'Oxycline',
           Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
           TRUE ~ 'Euxinic'
         )) %>%
  mutate(Depth = as.character(Depth)) %>%
  rename(`Depth (m)` = Depth) %>%
  column_to_rownames(var = 'sample_name') %>%
  select(-c(`Depth (m)`, Layer))


pa_prefer_expr_profile_eux_plot = pa_prefer_expr_profile_eux_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    gaps_col = 8,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    annotation_col = eux_pa_col_anno,
    annotation_colors = anno_colors,

    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          '#ebf4f7',
          '#aae6fa',
          '#f7e96d',
          '#f7e43b',
          'orange',
          '#ff6a00',
          'firebrick1',
          'darkorchid3',
          'purple4'))(3000)
      ),
    legend_breaks = 0:3,
    legend_labels = gt_render(c(
      '0',
      '1.72 x 10^0',
      '6.39 x 10^0',
      '1.91 x 10^1')),
    show_row_dend = FALSE,
    show_column_dend = FALSE
  )


euxinic_frac_pref_expr_plot = plot_grid(
  NULL,
  grid.grabExpr(draw(
    pa_prefer_expr_profile_eux_plot,
    merge_legend = TRUE,
    heatmap_legend_side = 'right',
    annotation_legend_side = 'right')),
  NULL,
  grid.grabExpr(draw(
    fl_prefer_expr_profile_eux_plot,
    merge_legend = TRUE,
    heatmap_legend_side = 'right',
    annotation_legend_side = 'right')),
  nrow = 1,
  rel_widths = c(0.1, 1, 0.1, 1),
  labels = c('PA','', 'FL', '')
)



all_bgc_expr_frac_pref_plots = plot_grid(
  NULL,
  oxycline_frac_pref_expr_plot,
  NULL,
  anoxic_frac_pref_expr_plot,
  NULL,
  euxinic_frac_pref_expr_plot,
  ncol = 2,
  nrow = 3,
  labels = c('a', '', 'b', '', 'c', ''),
  rel_widths = c(0.05, 1)
)
all_bgc_expr_frac_pref_plots


Supplementary Figure 6. Expression of biosynthetic transcripts from gene clusters

annotated as ladderanes

logTransformed_tpm_for_bgc_genes_minimap2_res = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/logTransformed_tpm_for_bgc_genes_minimap2_res.rds')
ladderane_colAnno = readRDS('~/Documents/cariaco-tidy-analysis/cariaco_github_data_small/rdata/ladderane_colAnno.rds')

allDaResults_df = allDaResults %>%
  map_dfr(~ .x) %>%
  group_by(mag_name) %>%
  mutate(strict_fraction_preference = case_when(
    all(preference == 'free-living enriched') ~ 'FL',
    all(preference == 'particle enriched') ~ 'PA',
    TRUE ~ 'No strict preference')) %>%
  ungroup()

ladderane_rowAnno = logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  left_join(
    bgc_genes_10kb %>%
      select(gene_name, product, contig),
    by = 'gene_name') %>%
  relocate(c(product, contig), .after = 1) %>%
  filter(product == 'ladderane') %>%
  mutate(row_sums = rowSums(.[4:ncol(.)]), .after = contig) %>%
  filter(row_sums > 0) %>%
  select(-c(gene_name, row_sums)) %>%
  group_by(contig, product) %>%
  summarize(across(everything(), ~ sum(.x)), .groups = 'drop') %>%
  mutate(temp = paste0(contig, ';', product),
         mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1')) %>%
  select(temp, mag_name) %>%
  left_join(
    mag_table %>%
      select(mag_name, phylum),
    by = 'mag_name') %>%
  left_join(
    allDaResults_df %>%
      select(mag_name, strict_fraction_preference) %>%
      distinct(),
    by = 'mag_name') %>%
  select(-mag_name) %>%
  column_to_rownames(var = 'temp') %>%
  rename(
    Phylum = phylum,
    `Fraction preference` = strict_fraction_preference)


ladderane_relPalette <- list(

  Layer = c(
    'Oxycline' = 'darkslategray3', #'wheat2',
    'Shallow anoxic' = 'gold',# 'khaki2',
    'Euxinic' =  'sienna3'),

  Fraction = c('PA' = 'darkgreen',
               'FL' = 'darkseagreen2'),

  Season = c('May' = 'slategray',
             'November' = 'slategray1'),

  `Depth (m)` = c(
    '103' = 'grey90',
    '148' = 'grey85',
    '198' = 'grey80',
    '200' = 'grey75',
    '234' = 'grey70',
    '237' = 'grey65',
    '247' = 'grey60',
    '267' = 'grey55',
    '295' = 'grey50',
    '314' = 'grey45',
    '900' = 'grey20'),

  Phylum = c('Planctomycetota' = 'khaki',
             'Desulfobacterota' = '#666666',
             'Myxococcota' = 'orange',
             'Fibrobacterota' = 'orchid1',
             'GCA-001730085' = 'slateblue1',
             'Latescibacterota' = 'aquamarine',
             'Omnitrophota' = 'darkviolet',
             'Verrucomicrobiota' = 'gainsboro',
             'UBP3' = 'deepskyblue'),

  `Fraction preference` = c('FL' = 'grey70',
                            'PA' = 'black',
                            'No strict preference' = 'white'))

ladderane_heat_data =
  logTransformed_tpm_for_bgc_genes_minimap2_res %>%
  left_join(
    bgc_genes_10kb %>%
      select(gene_name, product, contig),
    by = 'gene_name') %>%
  relocate(c(product, contig), .after = 1) %>%
  filter(product == 'ladderane') %>%
  mutate(row_sums = rowSums(.[4:ncol(.)]), .after = contig) %>%
  filter(row_sums > 0) %>%
  select(-c(gene_name, row_sums)) %>%
  mutate(across(where(is.numeric), ~ expm1(.x))) %>%
  group_by(contig, product) %>%
  summarize(across(everything(), ~ sum(.x)), .groups = 'drop') %>%
  mutate(across(where(is.numeric), ~ log1p(.x))) %>%
  mutate(temp = paste0(contig, ';', product)) %>%
  mutate(mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1')) %>%
  left_join(
    mag_table %>%
      select(mag_name, phylum),
    by = 'mag_name') %>%
  group_by(phylum) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  column_to_rownames(var = 'temp') %>%
  arrange(desc(n), phylum) %>%
  select(-c(contig, product, phylum, mag_name, n)) %>%
  relocate(colnames(.) %>%
    {function(x) {
      tbl = tibble(
        mpa = x[str_detect(x, 'MPA')],
        npa = x[str_detect(x, 'NPA')],
        mfl = x[str_detect(x, 'MFL')],
        nfl = x[str_detect(x, 'NFL')]
      ) %>%
        mutate(groupper = rep(paste0('group_', seq(1, 6, by = 1)), each = 2)) %>%
        group_by(groupper) %>%
        summarize(across(everything(), ~
                           paste0(.x, collapse = ';')),
                  .groups = 'drop') %>%
        select(-groupper)

      pa = as.vector(rbind(tbl$mpa, tbl$npa))
      fl = as.vector(rbind(tbl$mfl, tbl$nfl))

      g = c(fl, pa)
      g = str_split(g, pattern = ';') %>% unlist()

      return(g)}}()
    ) %>%
  as.matrix()

ladderane_colAnno = ladderane_colAnno %>%
  rownames_to_column(var = 'sample') %>%
  as_tibble() %>%
  arrange(factor(sample, levels = colnames(ladderane_heat_data))) %>%
  column_to_rownames(var = 'sample')


ladderane_rowAnno = ladderane_rowAnno %>%
  rownames_to_column(var = 'label') %>%
  as_tibble() %>%
  arrange(factor(label, levels = rownames(ladderane_heat_data))) %>%
  column_to_rownames(var = 'label')


ladderane_heat_plot = ladderane_heat_data %>%
  ComplexHeatmap::pheatmap(
    name = 'Transcripts per million',
    legend_breaks = 0:4,
    legend_labels = gt_render(c(
      '0',
      '1.72 x 10^0',
      '6.40 x 10^0',
      '1.91 x 10^1',
      '5.36 x 10^1')),
    fontsize = 8,
    cluster_cols = FALSE,
    cluster_rows = FALSE,
    show_rownames = FALSE,
    annotation_row = ladderane_rowAnno,
    annotation_col = ladderane_colAnno,
    annotation_colors = ladderane_relPalette,
    treeheight_row = 0,
    treeheight_col = 0,
    clustering_method = 'ward.D2',
    color =
      c(colorRampPalette(colors = 'white')(1),
        colorRampPalette(colors = c(
          #'white',
          #'gray100',
          #'gray100',
          #'gray99',
          #'gray98',
          '#ebf4f7',
          '#aae6fa',
          #'#c9e6f0',
          '#f7e96d',
          #'yellow',
          '#f7e43b',
          #'goldenrod1',
          'orange',
          '#ff6a00',
          'firebrick1',
          #'#D73027',
          'darkorchid3',
          #'purple3',
          'purple4'))(3000)))


ladderane_heat_plot_final = ComplexHeatmap::draw(
  ladderane_heat_plot,
  merge_legend = TRUE,
  heatmap_legend_side = 'right'
  #annotation_legend_side = 'right'
  )


Command line script to extract biosynthetic gene cluster predictions from the Cariaco MAGs using the antiSMASH 6.0 command line tool

#!/bin/bash
#SBATCH --partition=computer # Queue selection
#SBATCH --job-name=antismash6_array # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address@gmail.com # Where to send mail
#SBATCH --ntasks=36 # Run on 36 CPUs
#SBATCH --nodes=1 # one node
#SBATCH --cpus-per-task=1 # one CPU per parallel task
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/final_antismash_array_%A-%a.log# Standard output/error
#SBATCH --array= # fill this in with numbers uniquely affiliated with each of your MAGs
#SBATCH --qos=unlim
#
source activate anti6 # antismash 6.0 conda environment
GENOME_DIR=$(echo "path/to/intput/genome_fasta_files")
GBK_DIR=$(echo "path/to/output/genome_gene_prediction_gbk_files")
GFF_DIR=$(echo "path/to/output/genome/genome_gene_prediction_gff_files/files")
FASTA_DIR=$(echo "path/to/output/genome_gene_prediction_fasta_files")
OUT_DIR=$(echo "path/to/antismash/output/dir")

source activate gbk
cd "$GBK_DIR"

for FASTA in "$GENOME_DIR"/MAG_"$SLURM_ARRAY_TASK_ID".fa
do MAG=$(echo $FASTA | sed -r 's/.*(MAG_.*).fa/\1/')

#create gene predictions in FASTA format - used for rnaseq read alignment later
prodigal \
-i $FASTA \
-q \
-d "$FASTA_DIR"/"$MAG"-genes.fa

#create gene predictions in GFF format - to create GBK file antismash input
prodigal \
-i $FASTA \
-q \
-f gff \
-o "$GFF_DIR"/"$MAG".gff

#create GBK file that will be used as antismash input -- using gff_to_genbank.py script
python path/to/gff_to_genbank.py \
"$GFF_DIR"/"$MAG".gff $FASTA

mv "$GFF_DIR"/"$MAG".gb "$MAG".gbk

conda deactivate # deactivate conda env
source activate anti6 # activate antismash 6 conda env

antismash "$GBK_DIR"/"$MAG".gbk \
--output-dir "$OUT_DIR"/"$MAG" \
-c 36 \
--clusterhmmer \
--tigrfam \
--smcog-trees \
--cb-general \
--cb-subclusters \
--pfam2go \
--rre \
--cc-mibig \
--asf \
--allow-long-headers \
--genefinding-tool none \
--output-basename "$MAG"

done


Command line script to map metagenomic reads against Cariaco MAGs

#!/bin/bash
#SBATCH --partition=compute # Queue selection
#SBATCH --job-name=minimap2_metaG # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address # Where to send mail
#SBATCH --ntasks=36 # Run on 36 CPUs
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/minimap2_metaG_mapping_over_cariaco_genes_array_%A-%a.log# Standard output/error
#SBATCH --array=0-47
#SBATCH --qos=unlim

source activate minimap2

cd /vortexfs1/omics/pachiadaki/Cariaco-data/Metagenomes

FILE_PATTERN=("D1b900AM" "D1b900BM" "D2a143A" "D2a143B" "D2a200A" "D2a200B" "D2a237A" "D2a237B" "D2a247A" "D2a247B" "D2a267A" "D2a267B" "D2a900AN" "D2a900BN" "D2b143A" "D2b143B" "D2b200A" "D2b200B" "D2b237A" "D2b237B" "D2b247A" "D2b247B" "D2b267A" "D2b267B" "D2b900AN" "D2b900BN" "D3a103A" "D3a103B" "D3a198A" "D3a198B" "D3a234A" "D3a234B" "D3a295A" "D3a295B" "D3a314A" "D3a314B" "D3a900AM" "D3a900BM" "D3b103A" "D3b103B" "D3b198A" "D3b198B" "D3b234A" "D3b234B" "D3b295A" "D3b295B" "D3b314A" "D3b314B")

FILE_PATTERN=${FILE_PATTERN[$SLURM_ARRAY_TASK_ID]}

REF_DIR=$(echo "/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/FINAL_MAY_2022_ALL_ANTISMASH6_CONCAT_GENE_FASTA_INDEX")

OUT_DIR=$(echo "/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaG_mapping_final_results_used")

minimap2 -t 36 -ax sr "$REF_DIR"/cariaco_minimap2_ref.mmi "$FILE_PATTERN"_R1_paired.fastq.gz "$FILE_PATTERN"_R2_paired.fastq.gz > "$OUT_DIR"/"$FILE_PATTERN".sam

cd "$OUT_DIR"

paftools.js sam2paf "$FILE_PATTERN".sam > "$FILE_PATTERN".paf


Command line script to map metatranscriptomic reads against Cariaco MAGs

#!/bin/bash
#SBATCH --partition=compute # Queue selection
#SBATCH --job-name=minimap2_metaT # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address # Where to send mail
#SBATCH --ntasks=36 # Run on 36 CPUs
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/minimap2_metaT_mapping_over_cariaco_genes_array_%A-%a.log# Standard output/error
#SBATCH --array=0-47
#SBATCH --qos=unlim

source activate minimap2

cd /vortexfs1/omics/pachiadaki/dgellermcgrath/all-files-from-scratch-2-8-2021/cariaco_metatranscriptomes

FILE_PATTERN=("MFL103_1" "MFL103_2" "MFL198_1" "MFL198_2" "MFL234_1" "MFL234_2" "MFL295_1" "MFL295_2" "MFL314_1" "MFL314_2" "MFL900_1" "MFL900_2" "MPA103_1" "MPA103_2" "MPA198_1" "MPA198_2" "MPA234_1" "MPA234_2" "MPA295_1" "MPA295_2" "MPA314_1" "MPA314_2" "MPA900_1" "MPA900_2" "NFL148_1" "NFL148_2" "NFL200_1" "NFL200_2" "NFL237_1" "NFL237_2" "NFL247_1" "NFL247_2" "NFL267_1" "NFL267_2" "NFL900_1" "NFL900_2" "NPA148_1" "NPA148_2" "NPA200_1" "NPA200_2" "NPA237_1" "NPA237_2" "NPA247_1" "NPA247_2" "NPA267_1" "NPA267_2" "NPA900_1" "NPA900_2")

FILE_PATTERN=${FILE_PATTERN[$SLURM_ARRAY_TASK_ID]}

minimap2 -t 36 -ax sr /vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/FINAL_MAY_2022_ALL_ANTISMASH6_CONCAT_GENE_FASTA_INDEX/cariaco_minimap2_ref.mmi "$FILE_PATTERN"_R1_paired.fastq.gz "$FILE_PATTERN"_R2_paired.fastq.gz > /vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaT_mapping_final_results_used/"$FILE_PATTERN".sam

cd /vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaT_mapping_final_results_used/

paftools.js sam2paf "$FILE_PATTERN".sam > "$FILE_PATTERN".paf


Command line script to interactively run BiG-SCAPE on a HPC compute node on BGCs > 10kb in length

conda activate bigscape # conda env to run BiG-SCAPE

INPUT_GBK_DIR=$(echo "path/to/antismash6/BGC/gbk/files")

PFAM_DIR=$(echo "path/to/downloaded/pfam/directory")

python bigscape.py \
-i "$INPUT_GBK_DIR" \
-o cariaco_bigscape_results_FINAL \
-c 36 \
--pfam_dir "$PFAM_DIR" \
--mibig \
--clan_cutoff 0.3 0.7


Command line script to map reads to whole genomes, obtain counts for use as input for DESeq2 differential abundance analaysis

#!/bin/bash
#SBATCH --partition=compute # Queue selection
#SBATCH --job-name=coverM_genomeCounts # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address # Where to send mail
#SBATCH --ntasks=36 # Run on one CPU
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/coverm_mag_counts_of_reads_mapped_array_%A-%a.log# Standard output/error
#SBATCH --array=0-47
#SBATCH --qos=unlim

source activate coverm

cd /vortexfs1/omics/pachiadaki/Cariaco-data/Metagenomes

GENOME_DIR=$(echo "path/to/genome/fasta/dir/")

OUT_DIR=$(echo "path/to/desired/output/dir")

FILE_PATTERN=("D1b900AM" "D1b900BM" "D2a143A" "D2a143B" "D2a200A" "D2a200B" "D2a237A" "D2a237B" "D2a247A" "D2a247B" "D2a267A" "D2a267B" "D2a900AN" "D2a900BN" "D2b143A" "D2b143B" "D2b200A" "D2b200B" "D2b237A" "D2b237B" "D2b247A" "D2b247B" "D2b267A" "D2b267B" "D2b900AN" "D2b900BN" "D3a103A" "D3a103B" "D3a198A" "D3a198B" "D3a234A" "D3a234B" "D3a295A" "D3a295B" "D3a314A" "D3a314B" "D3a900AM" "D3a900BM" "D3b103A" "D3b103B" "D3b198A" "D3b198B" "D3b234A" "D3b234B" "D3b295A" "D3b295B" "D3b314A" "D3b314B")

FILE_PATTERN=${FILE_PATTERN[$SLURM_ARRAY_TASK_ID]}

coverm genome \
--coupled "$FILE_PATTERN"_R1_paired.fastq.gz "$FILE_PATTERN"_R2_paired.fastq.gz \
--genome-fasta-directory "$GENOME_DIR" \
--genome-fasta-extension fa \
--mapper bwa-mem \
--methods count \
--min-covered-fraction 0 \
--min-read-percent-identity 95 \
--min-read-aligned-percent 50 \
--exclude-supplementary \
--threads 36 \
-o "$OUT_DIR"/"$FILE_PATTERN"_counts_reads_mapped_per_genome.tsv


Command line script to map reads to whole genomes, obtain relative abundance information for Cariaco MAGS in each metagenomic sample

#!/bin/bash
#SBATCH --partition=compute # Queue selection
#SBATCH --job-name=coverM_relAbun # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address # Where to send mail
#SBATCH --ntasks=36 # Run on one CPU
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/coverm_mag_counts_of_reads_mapped_array_%A-%a.log# Standard output/error
#SBATCH --array=0-47
#SBATCH --qos=unlim

source activate coverm

cd /vortexfs1/omics/pachiadaki/Cariaco-data/Metagenomes

GENOME_DIR=$(echo "path/to/genome/fasta/dir/")

OUT_DIR=$(echo "path/to/desired/output/dir")

FILE_PATTERN=("D1b900AM" "D1b900BM" "D2a143A" "D2a143B" "D2a200A" "D2a200B" "D2a237A" "D2a237B" "D2a247A" "D2a247B" "D2a267A" "D2a267B" "D2a900AN" "D2a900BN" "D2b143A" "D2b143B" "D2b200A" "D2b200B" "D2b237A" "D2b237B" "D2b247A" "D2b247B" "D2b267A" "D2b267B" "D2b900AN" "D2b900BN" "D3a103A" "D3a103B" "D3a198A" "D3a198B" "D3a234A" "D3a234B" "D3a295A" "D3a295B" "D3a314A" "D3a314B" "D3a900AM" "D3a900BM" "D3b103A" "D3b103B" "D3b198A" "D3b198B" "D3b234A" "D3b234B" "D3b295A" "D3b295B" "D3b314A" "D3b314B")

FILE_PATTERN=${FILE_PATTERN[$SLURM_ARRAY_TASK_ID]}

coverm genome \
--coupled "$FILE_PATTERN"_R1_paired.fastq.gz "$FILE_PATTERN"_R2_paired.fastq.gz \
--genome-fasta-directory "$GENOME_DIR" \
--genome-fasta-extension fa \
--mapper bwa-mem \
--methods relative_abundance \
--min-read-percent-identity 95 \
--min-read-aligned-percent 50 \
--exclude-supplementary \
--threads 36 \
-o "$OUT_DIR"/"$FILE_PATTERN"_minReadAlignedPercent95.tsv


Command line interactive script to detect chimerism and contamination in ladderane BGCs using the GUNC command line tool

conda activate gunc # activate gunc conda environment

LADDERANE_DIR=$(echo "path/to/genomes/containing/predicted/ladderane/clusters/fasta/files/dir/")

GUNC_DB=$(echo "path/to/GUNC/db")

OUT_DIR=$(echo "path/to/desired/output/dir/")

for MAG in "$LADDERANE_DIR"
do gunc run \
-i "$MAG" \
-r "$GUNC_DB" \
--output-dir "$OUT_DIR"/"$MAG" \
--threads 36